"""howard.objects.variants"""
import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:
    """Set of variants backed by a DuckDB or SQLite database connection."""

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize input/output files, configuration, parameters, database
        connexion and VCF header, and optionally load the data.

        :param conn: existing database connection (a new one is created if None)
        :param input: input file path, or a file-like object exposing ``name``
        :param output: output file path, or a file-like object exposing ``name``
        :param config: configuration dictionary (defaults to an empty dict)
        :param param: parameters dictionary (defaults to an empty dict)
        :param load: if True, load the input data immediately
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config — None sentinel avoids the shared-mutable-default pitfall
        self.set_config(config if config is not None else {})

        # Param — None sentinel avoids the shared-mutable-default pitfall
        self.set_param(param if param is not None else {})

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Load data
        if load:
            self.load_data()

    def set_input(self, input: str = None) -> None:
        """
        Set the input file and derive its name, extension and format.

        Accepts either a path string or a file-like object exposing ``name``.

        :param input: the input file
        :raises ValueError: if ``input`` is neither a string nor file-like
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            except AttributeError:
                # Fixed: message was missing its closing quote
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format derived from the file extension
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")
log.error(f"Input file '{input} in bad format") 93 raise ValueError(f"Input file '{input} in bad format") 94 else: 95 self.input = input 96 97 # Input format 98 if input: 99 input_name, input_extension = os.path.splitext(self.input) 100 self.input_name = input_name 101 self.input_extension = input_extension 102 self.input_format = self.input_extension.replace(".", "") 103 104 def set_config(self, config: dict) -> None: 105 """ 106 This function takes in a config object and sets it as the config object for the class 107 108 :param config: The configuration object 109 """ 110 self.config = config 111 112 def set_param(self, param: dict) -> None: 113 """ 114 This function takes in a param object and sets it as the param object for the class 115 116 :param param: The paramters object 117 """ 118 self.param = param 119 120 def init_variables(self) -> None: 121 """ 122 This function initializes the variables that will be used in the rest of the class 123 """ 124 self.prefix = "howard" 125 self.table_variants = "variants" 126 self.dataframe = None 127 128 self.comparison_map = { 129 "gt": ">", 130 "gte": ">=", 131 "lt": "<", 132 "lte": "<=", 133 "equals": "=", 134 "contains": "SIMILAR TO", 135 } 136 137 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 138 139 self.code_type_map_to_sql = { 140 "Integer": "INTEGER", 141 "String": "VARCHAR", 142 "Float": "FLOAT", 143 "Flag": "VARCHAR", 144 } 145 146 self.index_additionnal_fields = [] 147 148 def get_indexing(self) -> bool: 149 """ 150 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 151 returns False. 152 :return: The value of the indexing parameter. 153 """ 154 return self.get_param().get("indexing", False) 155 156 def get_connexion_config(self) -> dict: 157 """ 158 The function `get_connexion_config` returns a dictionary containing the configuration for a 159 connection, including the number of threads and memory limit. 
160 :return: a dictionary containing the configuration for the Connexion library. 161 """ 162 163 # config 164 config = self.get_config() 165 166 # Connexion config 167 connexion_config = {} 168 threads = self.get_threads() 169 170 # Threads 171 if threads: 172 connexion_config["threads"] = threads 173 174 # Memory 175 # if config.get("memory", None): 176 # connexion_config["memory_limit"] = config.get("memory") 177 if self.get_memory(): 178 connexion_config["memory_limit"] = self.get_memory() 179 180 # Temporary directory 181 if config.get("tmp", None): 182 connexion_config["temp_directory"] = config.get("tmp") 183 184 # Access 185 if config.get("access", None): 186 access = config.get("access") 187 if access in ["RO"]: 188 access = "READ_ONLY" 189 elif access in ["RW"]: 190 access = "READ_WRITE" 191 connexion_db = self.get_connexion_db() 192 if connexion_db in ":memory:": 193 access = "READ_WRITE" 194 connexion_config["access_mode"] = access 195 196 return connexion_config 197 198 def get_duckdb_settings(self) -> dict: 199 """ 200 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 201 string. 202 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 
203 """ 204 205 # config 206 config = self.get_config() 207 208 # duckdb settings 209 duckdb_settings_dict = {} 210 if config.get("duckdb_settings", None): 211 duckdb_settings = config.get("duckdb_settings") 212 duckdb_settings = full_path(duckdb_settings) 213 # duckdb setting is a file 214 if os.path.exists(duckdb_settings): 215 with open(duckdb_settings) as json_file: 216 duckdb_settings_dict = yaml.safe_load(json_file) 217 # duckdb settings is a string 218 else: 219 duckdb_settings_dict = json.loads(duckdb_settings) 220 221 return duckdb_settings_dict 222 223 def set_connexion_db(self) -> str: 224 """ 225 The function `set_connexion_db` returns the appropriate database connection string based on the 226 input format and connection type. 227 :return: the value of the variable `connexion_db`. 228 """ 229 230 # Default connexion db 231 default_connexion_db = ":memory:" 232 233 # Find connexion db 234 if self.get_input_format() in ["db", "duckdb"]: 235 connexion_db = self.get_input() 236 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 237 connexion_db = default_connexion_db 238 elif self.get_connexion_type() in ["tmpfile"]: 239 tmp_name = tempfile.mkdtemp( 240 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 241 ) 242 connexion_db = f"{tmp_name}/tmp.db" 243 elif self.get_connexion_type() != "": 244 connexion_db = self.get_connexion_type() 245 else: 246 connexion_db = default_connexion_db 247 248 # Set connexion db 249 self.connexion_db = connexion_db 250 251 return connexion_db 252 253 def set_connexion(self, conn) -> None: 254 """ 255 It creates a connection to the database 256 257 :param conn: The connection to the database. 
    def set_output(self, output: str = None) -> None:
        """
        Set the output file and derive its name, extension and format.

        Accepts either a path string or a file-like object exposing ``name``.

        :param output: The name of the output file
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format derived from the file extension
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        strings (``self.header_list``) and as a ``vcf.Reader`` object
        (``self.header_vcf``).

        The header is searched, in order: in the configured "header_file",
        inside the input VCF itself (compressed or not), in an external
        "<input>.hdr" file, or inferred from the file columns via a Database
        object; a minimal default header is used as a last resort.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # Best-effort inference failed: fall back to default
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown format: refuse

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame.

        :param query: the SQL query to execute
        :type query: str
        :param limit: optional maximum number of rows; when set, only the
            first batch/chunk of that size is fetched
        :return: a pandas DataFrame with the query result
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch only the first record batch of size `limit`
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # Read only the first chunk of size `limit`
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        Log an overview of the current object: input/output files and formats,
        configuration, parameters, sample list and the variants DataFrame.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector: release the (possibly large) DataFrame promptly
        del df
        gc.collect()

        return None
log.info("Config: ") 479 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 480 "\n" 481 ): 482 log.info("\t" + str(d)) 483 log.info("Param: ") 484 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 485 "\n" 486 ): 487 log.info("\t" + str(d)) 488 log.info("Sample list: " + str(self.get_header_sample_list())) 489 log.info("Dataframe: ") 490 for d in str(df).split("\n"): 491 log.info("\t" + str(d)) 492 493 # garbage collector 494 del df 495 gc.collect() 496 497 return None 498 499 def get_stats(self) -> dict: 500 """ 501 The `get_stats` function calculates and returns various statistics of the current object, 502 including information about the input file, variants, samples, header fields, quality, and 503 SNVs/InDels. 504 :return: a dictionary containing various statistics of the current object. The dictionary has 505 the following structure: 506 """ 507 508 # Log 509 log.info(f"Stats Calculation...") 510 511 # table varaints 512 table_variants_from = self.get_table_variants() 513 514 # stats dict 515 stats = {"Infos": {}} 516 517 ### File 518 input_file = self.get_input() 519 stats["Infos"]["Input file"] = input_file 520 521 # Header 522 header_infos = self.get_header().infos 523 header_formats = self.get_header().formats 524 header_infos_list = list(header_infos) 525 header_formats_list = list(header_formats) 526 527 ### Variants 528 529 stats["Variants"] = {} 530 531 # Variants by chr 532 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 533 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 534 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 535 by=["CHROM"], kind="quicksort" 536 ) 537 538 # Total number of variants 539 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 540 541 # Calculate percentage 542 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 543 
lambda x: (x / nb_of_variants) 544 ) 545 546 stats["Variants"]["Number of variants by chromosome"] = ( 547 nb_of_variants_by_chrom.to_dict(orient="index") 548 ) 549 550 stats["Infos"]["Number of variants"] = int(nb_of_variants) 551 552 ### Samples 553 554 # Init 555 samples = {} 556 nb_of_samples = 0 557 558 # Check Samples 559 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 560 log.debug(f"Check samples...") 561 for sample in self.get_header_sample_list(): 562 sql_query_samples = f""" 563 SELECT '{sample}' as sample, 564 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 565 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 566 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 567 FROM {table_variants_from} 568 WHERE ( 569 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 570 AND 571 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 572 ) 573 GROUP BY genotype 574 """ 575 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 576 sample_genotype_count = sql_query_genotype_df["count"].sum() 577 if len(sql_query_genotype_df): 578 nb_of_samples += 1 579 samples[f"{sample} - {sample_genotype_count} variants"] = ( 580 sql_query_genotype_df.to_dict(orient="index") 581 ) 582 583 stats["Samples"] = samples 584 stats["Infos"]["Number of samples"] = nb_of_samples 585 586 # # 587 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 588 # stats["Infos"]["Number of samples"] = nb_of_samples 589 # elif nb_of_samples: 590 # stats["Infos"]["Number of samples"] = "not a VCF format" 591 592 ### INFO and FORMAT fields 593 header_types_df = {} 594 header_types_list = { 595 "List of INFO fields": header_infos, 596 "List of FORMAT fields": header_formats, 597 } 598 i = 0 599 for header_type in header_types_list: 600 601 header_type_infos = header_types_list.get(header_type) 602 header_infos_dict = {} 
603 604 for info in header_type_infos: 605 606 i += 1 607 header_infos_dict[i] = {} 608 609 # ID 610 header_infos_dict[i]["id"] = info 611 612 # num 613 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 614 if header_type_infos[info].num in genotype_map.keys(): 615 header_infos_dict[i]["Number"] = genotype_map.get( 616 header_type_infos[info].num 617 ) 618 else: 619 header_infos_dict[i]["Number"] = header_type_infos[info].num 620 621 # type 622 if header_type_infos[info].type: 623 header_infos_dict[i]["Type"] = header_type_infos[info].type 624 else: 625 header_infos_dict[i]["Type"] = "." 626 627 # desc 628 if header_type_infos[info].desc != None: 629 header_infos_dict[i]["Description"] = header_type_infos[info].desc 630 else: 631 header_infos_dict[i]["Description"] = "" 632 633 if len(header_infos_dict): 634 header_types_df[header_type] = pd.DataFrame.from_dict( 635 header_infos_dict, orient="index" 636 ).to_dict(orient="index") 637 638 # Stats 639 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 640 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 641 stats["Header"] = header_types_df 642 643 ### QUAL 644 if "QUAL" in self.get_header_columns(): 645 sql_query_qual = f""" 646 SELECT 647 avg(CAST(QUAL AS INTEGER)) AS Average, 648 min(CAST(QUAL AS INTEGER)) AS Minimum, 649 max(CAST(QUAL AS INTEGER)) AS Maximum, 650 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 651 median(CAST(QUAL AS INTEGER)) AS Median, 652 variance(CAST(QUAL AS INTEGER)) AS Variance 653 FROM {table_variants_from} 654 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 655 """ 656 657 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 658 stats["Quality"] = {"Stats": qual} 659 660 ### SNV and InDel 661 662 sql_query_snv = f""" 663 664 SELECT Type, count FROM ( 665 666 SELECT 667 'Total' AS Type, 668 count(*) AS count 669 FROM {table_variants_from} 670 671 UNION 672 673 SELECT 674 'MNV' AS Type, 675 count(*) AS count 676 FROM {table_variants_from} 
677 WHERE len(REF) > 1 AND len(ALT) > 1 678 AND len(REF) = len(ALT) 679 680 UNION 681 682 SELECT 683 'InDel' AS Type, 684 count(*) AS count 685 FROM {table_variants_from} 686 WHERE len(REF) > 1 OR len(ALT) > 1 687 AND len(REF) != len(ALT) 688 689 UNION 690 691 SELECT 692 'SNV' AS Type, 693 count(*) AS count 694 FROM {table_variants_from} 695 WHERE len(REF) = 1 AND len(ALT) = 1 696 697 ) 698 699 ORDER BY count DESC 700 701 """ 702 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 703 704 sql_query_snv_substitution = f""" 705 SELECT 706 concat(REF, '>', ALT) AS 'Substitution', 707 count(*) AS count 708 FROM {table_variants_from} 709 WHERE len(REF) = 1 AND len(ALT) = 1 710 GROUP BY REF, ALT 711 ORDER BY count(*) DESC 712 """ 713 snv_substitution = ( 714 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 715 ) 716 stats["Variants"]["Counts"] = snv_indel 717 stats["Variants"]["Substitutions"] = snv_substitution 718 719 return stats 720 721 def stats_to_file(self, file: str = None) -> str: 722 """ 723 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 724 into a JSON object, and writes the JSON object to the specified file. 725 726 :param file: The `file` parameter is a string that represents the file path where the JSON data 727 will be written 728 :type file: str 729 :return: the name of the file that was written to. 730 """ 731 732 # Get stats 733 stats = self.get_stats() 734 735 # Serializing json 736 json_object = json.dumps(stats, indent=4) 737 738 # Writing to sample.json 739 with open(file, "w") as outfile: 740 outfile.write(json_object) 741 742 return file 743 744 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 745 """ 746 The `print_stats` function generates a markdown file and prints the statistics contained in a 747 JSON file in a formatted manner. 
748 749 :param output_file: The `output_file` parameter is a string that specifies the path and filename 750 of the output file where the stats will be printed in Markdown format. If no `output_file` is 751 provided, a temporary directory will be created and the stats will be saved in a file named 752 "stats.md" within that 753 :type output_file: str 754 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 755 file where the statistics will be saved. If no value is provided, a temporary directory will be 756 created and a default file name "stats.json" will be used 757 :type json_file: str 758 :return: The function `print_stats` does not return any value. It has a return type annotation 759 of `None`. 760 """ 761 762 # Full path 763 output_file = full_path(output_file) 764 json_file = full_path(json_file) 765 766 with tempfile.TemporaryDirectory() as tmpdir: 767 768 # Files 769 if not output_file: 770 output_file = os.path.join(tmpdir, "stats.md") 771 if not json_file: 772 json_file = os.path.join(tmpdir, "stats.json") 773 774 # Create folders 775 if not os.path.exists(os.path.dirname(output_file)): 776 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 777 if not os.path.exists(os.path.dirname(json_file)): 778 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 779 780 # Create stats JSON file 781 stats_file = self.stats_to_file(file=json_file) 782 783 # Print stats file 784 with open(stats_file) as f: 785 stats = yaml.safe_load(f) 786 787 # Output 788 output_title = [] 789 output_index = [] 790 output = [] 791 792 # Title 793 output_title.append("# HOWARD Stats") 794 795 # Index 796 output_index.append("## Index") 797 798 # Process sections 799 for section in stats: 800 infos = stats.get(section) 801 section_link = "#" + section.lower().replace(" ", "-") 802 output.append(f"## {section}") 803 output_index.append(f"- [{section}]({section_link})") 804 805 if len(infos): 806 for info in 
infos: 807 try: 808 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 809 is_df = True 810 except: 811 try: 812 df = pd.DataFrame.from_dict( 813 json.loads((infos.get(info))), orient="index" 814 ) 815 is_df = True 816 except: 817 is_df = False 818 if is_df: 819 output.append(f"### {info}") 820 info_link = "#" + info.lower().replace(" ", "-") 821 output_index.append(f" - [{info}]({info_link})") 822 output.append(f"{df.to_markdown(index=False)}") 823 else: 824 output.append(f"- {info}: {infos.get(info)}") 825 else: 826 output.append(f"NA") 827 828 # Write stats in markdown file 829 with open(output_file, "w") as fp: 830 for item in output_title: 831 fp.write("%s\n" % item) 832 for item in output_index: 833 fp.write("%s\n" % item) 834 for item in output: 835 fp.write("%s\n" % item) 836 837 # Output stats in markdown 838 print("") 839 print("\n\n".join(output_title)) 840 print("") 841 print("\n\n".join(output)) 842 print("") 843 844 return None 845 846 def get_input(self) -> str: 847 """ 848 It returns the value of the input variable. 849 :return: The input is being returned. 850 """ 851 return self.input 852 853 def get_input_format(self, input_file: str = None) -> str: 854 """ 855 It returns the format of the input variable. 856 :return: The format is being returned. 857 """ 858 if not input_file: 859 input_file = self.get_input() 860 input_format = get_file_format(input_file) 861 return input_format 862 863 def get_input_compressed(self, input_file: str = None) -> str: 864 """ 865 It returns the format of the input variable. 866 :return: The format is being returned. 867 """ 868 if not input_file: 869 input_file = self.get_input() 870 input_compressed = get_file_compressed(input_file) 871 return input_compressed 872 873 def get_output(self) -> str: 874 """ 875 It returns the output of the neuron. 876 :return: The output of the neural network. 
877 """ 878 return self.output 879 880 def get_output_format(self, output_file: str = None) -> str: 881 """ 882 It returns the format of the input variable. 883 :return: The format is being returned. 884 """ 885 if not output_file: 886 output_file = self.get_output() 887 output_format = get_file_format(output_file) 888 889 return output_format 890 891 def get_config(self) -> dict: 892 """ 893 It returns the config 894 :return: The config variable is being returned. 895 """ 896 return self.config 897 898 def get_param(self) -> dict: 899 """ 900 It returns the param 901 :return: The param variable is being returned. 902 """ 903 return self.param 904 905 def get_connexion_db(self) -> str: 906 """ 907 It returns the connexion_db attribute of the object 908 :return: The connexion_db is being returned. 909 """ 910 return self.connexion_db 911 912 def get_prefix(self) -> str: 913 """ 914 It returns the prefix of the object. 915 :return: The prefix is being returned. 916 """ 917 return self.prefix 918 919 def get_table_variants(self, clause: str = "select") -> str: 920 """ 921 This function returns the table_variants attribute of the object 922 923 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 924 defaults to select (optional) 925 :return: The table_variants attribute of the object. 
926 """ 927 928 # Access 929 access = self.get_config().get("access", None) 930 931 # Clauses "select", "where", "update" 932 if clause in ["select", "where", "update"]: 933 table_variants = self.table_variants 934 # Clause "from" 935 elif clause in ["from"]: 936 # For Read Only 937 if self.get_input_format() in ["parquet"] and access in ["RO"]: 938 input_file = self.get_input() 939 table_variants = f"'{input_file}' as variants" 940 # For Read Write 941 else: 942 table_variants = f"{self.table_variants} as variants" 943 else: 944 table_variants = self.table_variants 945 return table_variants 946 947 def get_tmp_dir(self) -> str: 948 """ 949 The function `get_tmp_dir` returns the temporary directory path based on configuration 950 parameters or a default path. 951 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 952 configuration, parameters, and a default value of "/tmp". 953 """ 954 955 return get_tmp( 956 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 957 ) 958 959 def get_connexion_type(self) -> str: 960 """ 961 If the connexion type is not in the list of allowed connexion types, raise a ValueError 962 963 :return: The connexion type is being returned. 964 """ 965 return self.get_config().get("connexion_type", "memory") 966 967 def get_connexion(self): 968 """ 969 It returns the connection object 970 971 :return: The connection object. 972 """ 973 return self.conn 974 975 def close_connexion(self) -> None: 976 """ 977 This function closes the connection to the database. 978 :return: The connection is being closed. 979 """ 980 return self.conn.close() 981 982 def get_header(self, type: str = "vcf"): 983 """ 984 This function returns the header of the VCF file as a list of strings 985 986 :param type: the type of header you want to get, defaults to vcf (optional) 987 :return: The header of the vcf file. 
988 """ 989 990 if self.header_vcf: 991 if type == "vcf": 992 return self.header_vcf 993 elif type == "list": 994 return self.header_list 995 else: 996 if type == "vcf": 997 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 998 return header 999 elif type == "list": 1000 return vcf_required 1001 1002 def get_header_length(self, file: str = None) -> int: 1003 """ 1004 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1005 line. 1006 1007 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1008 header file. If this argument is provided, the function will read the header from the specified 1009 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1010 :type file: str 1011 :return: the length of the header list, excluding the #CHROM line. 1012 """ 1013 1014 if file: 1015 return len(self.read_vcf_header_file(file=file)) - 1 1016 elif self.get_header(type="list"): 1017 return len(self.get_header(type="list")) - 1 1018 else: 1019 return 0 1020 1021 def get_header_columns(self) -> str: 1022 """ 1023 This function returns the header list of a VCF 1024 1025 :return: The length of the header list. 1026 """ 1027 if self.get_header(): 1028 return self.get_header(type="list")[-1] 1029 else: 1030 return "" 1031 1032 def get_header_columns_as_list(self) -> list: 1033 """ 1034 This function returns the header list of a VCF 1035 1036 :return: The length of the header list. 1037 """ 1038 if self.get_header(): 1039 return self.get_header_columns().strip().split("\t") 1040 else: 1041 return [] 1042 1043 def get_header_columns_as_sql(self) -> str: 1044 """ 1045 This function retruns header length (without #CHROM line) 1046 1047 :return: The length of the header list. 
1048 """ 1049 sql_column_list = [] 1050 for col in self.get_header_columns_as_list(): 1051 sql_column_list.append(f'"{col}"') 1052 return ",".join(sql_column_list) 1053 1054 def get_header_sample_list(self) -> list: 1055 """ 1056 This function retruns header length (without #CHROM line) 1057 1058 :return: The length of the header list. 1059 """ 1060 return self.header_vcf.samples 1061 1062 def get_verbose(self) -> bool: 1063 """ 1064 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1065 exist 1066 1067 :return: The value of the key "verbose" in the config dictionary. 1068 """ 1069 return self.get_config().get("verbose", False) 1070 1071 def get_connexion_format(self) -> str: 1072 """ 1073 It returns the connexion format of the object. 1074 :return: The connexion_format is being returned. 1075 """ 1076 connexion_format = self.connexion_format 1077 if connexion_format not in ["duckdb", "sqlite"]: 1078 log.error(f"Unknown connexion format {connexion_format}") 1079 raise ValueError(f"Unknown connexion format {connexion_format}") 1080 else: 1081 return connexion_format 1082 1083 def insert_file_to_table( 1084 self, 1085 file, 1086 columns: str, 1087 header_len: int = 0, 1088 sep: str = "\t", 1089 chunksize: int = 1000000, 1090 ) -> None: 1091 """ 1092 The function reads a file in chunks, and inserts each chunk into a table 1093 1094 :param file: the file to be loaded 1095 :param columns: a string of the column names separated by commas 1096 :param header_len: the number of lines to skip at the beginning of the file, defaults to 0 1097 (optional) 1098 :param sep: the separator used in the file, defaults to \t (optional) 1099 :param chunksize: The number of rows to read in at a time, defaults to 1000000 (optional) 1100 """ 1101 1102 # Config 1103 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1104 connexion_format = self.get_connexion_format() 1105 1106 log.debug("chunksize: " + str(chunksize)) 1107 
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        :raises ValueError: When the input format is not loadable with the
        current connexion format
        """

        log.info("Loading...")

        # change input file (re-reads the header for the new input)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access ("RO" = read-only: create a VIEW instead of a TABLE below)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format (only used for logging here)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (-1 means "no sampling" for Database.get_sql_from)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View from the input file's SQL source
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available
                    # NOTE(review): bare except — the original error is hidden
                    # behind a generic "not available" message
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure (fixed VCF columns and their SQL types)
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): "structure_complete" is an alias of "structure"
            # (no copy) — both names refer to the same dict below
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter deduced from the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): the bgzf handle replaces the plain handle but
                # is not itself managed by the with-statement
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
1334 1335 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1336 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1337 comma-separated list of field names to explode 1338 :type explode_infos_fields: str 1339 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1340 flag that determines whether to remove fields that are not present in the header. If it is set 1341 to `True`, any field that is not in the header will be excluded from the list of exploded 1342 information fields. If it is set to `, defaults to False 1343 :type remove_fields_not_in_header: bool (optional) 1344 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1345 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1346 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1347 Otherwise, it returns a list of exploded information fields after removing any spaces and 1348 splitting the string by commas. 
1349 """ 1350 1351 # If no fields, get it in param 1352 if not explode_infos_fields: 1353 explode_infos_fields = ( 1354 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1355 ) 1356 1357 # If no fields, defined as all fields in header using keyword 1358 if not explode_infos_fields: 1359 explode_infos_fields = "*" 1360 1361 # If fields list not empty 1362 if explode_infos_fields: 1363 1364 # Input fields list 1365 if isinstance(explode_infos_fields, str): 1366 fields_input = explode_infos_fields.split(",") 1367 elif isinstance(explode_infos_fields, list): 1368 fields_input = explode_infos_fields 1369 else: 1370 fields_input = [] 1371 1372 # Fields list without * keyword 1373 fields_without_all = fields_input.copy() 1374 if "*".casefold() in (item.casefold() for item in fields_without_all): 1375 fields_without_all.remove("*") 1376 1377 # Fields in header 1378 fields_in_header = sorted(list(set(self.get_header().infos))) 1379 1380 # Construct list of fields 1381 fields_output = [] 1382 for field in fields_input: 1383 1384 # Strip field 1385 field = field.strip() 1386 1387 # format keyword * in regex 1388 if field.upper() in ["*"]: 1389 field = ".*" 1390 1391 # Find all fields with pattern 1392 r = re.compile(field) 1393 fields_search = sorted(list(filter(r.match, fields_in_header))) 1394 1395 # Remove fields input from search 1396 if fields_search != [field]: 1397 fields_search = sorted( 1398 list(set(fields_search).difference(fields_input)) 1399 ) 1400 1401 # If field is not in header (avoid not well formatted header) 1402 if not fields_search and not remove_fields_not_in_header: 1403 fields_search = [field] 1404 1405 # Add found fields 1406 for new_field in fields_search: 1407 # Add field, if not already exists, and if it is in header (if asked) 1408 if ( 1409 new_field not in fields_output 1410 and ( 1411 not remove_fields_not_in_header 1412 or new_field in fields_in_header 1413 ) 1414 and new_field not in [".*"] 1415 ): 1416 
    def add_column(
        self,
        table_name: str,
        column_name: str,
        column_type: str,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the table
        :param column_type: The SQL data type of the new column, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: Optional default value assigned to the new column
        for existing rows (interpolated verbatim into the DEFAULT clause)
        :param drop: When True and the column already exists, it is dropped and
        re-created; when False (default), an existing column is left untouched
        :type drop: bool (optional)
        :return: A dict describing the added column ({"table_name",
        "column_name", "column_type", "default_value"}), or None when the
        column already existed — including the drop-and-re-create case
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (a LIMIT 0 query only fetches the column layout, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): a dropped-and-re-created column is reported as "not
        # added" (added = False) and the function returns None in that case —
        # callers appear to rely on the "added_column or force" pattern
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
        columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix (falling back to "INFO/")
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
        individual columns. If this parameter is not provided, all INFO fields will be exploded
        :type fields: list
        :param force: The `force` parameter is a boolean flag that determines whether to drop and
        recreate the column if it already exists in the table. If `force` is set to `True`, the column
        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: Boolean flag that determines whether all INFO fields are
        updated in a single UPDATE statement (True) or one UPDATE per field (False), defaults to False.
        (Note: the misspelling is part of the public interface and is kept for compatibility.)
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns.
        """

        # drop indexes (they would be invalidated by the UPDATEs below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access ("RO" databases are never modified)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix — may arrive as None/True from params; fall back to the
            # configured prefix, then to the "INFO/" default
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (columns in the table but not in the header)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly asked ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (regex / "*" expansion)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; unknown fields default to
                    # a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array — extract "<info>=<value>" from
                        # the raw INFO column, engine-specific syntax
                        # NOTE(review): field names are interpolated directly
                        # into SQL; assumes trusted, well-formed header names
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (smaller UPDATEs; fall back to a single
                # whole-table pass when the chromosome list cannot be read)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1799 self.conn.execute(sql_create_table_index) 1800 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1801 self.conn.execute(sql_create_table_index) 1802 for field in self.index_additionnal_fields: 1803 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1804 self.conn.execute(sql_create_table_index) 1805 1806 def drop_indexes(self) -> None: 1807 """ 1808 Create indexes on the table after insertion 1809 """ 1810 1811 # Access 1812 access = self.get_config().get("access", None) 1813 1814 # get table variants 1815 table_variants = self.get_table_variants("FROM") 1816 1817 # Get database format 1818 connexion_format = self.get_connexion_format() 1819 1820 if access not in ["RO"]: 1821 if connexion_format in ["duckdb"]: 1822 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1823 elif connexion_format in ["sqlite"]: 1824 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1825 1826 list_indexes = self.conn.execute(sql_list_indexes) 1827 index_names = [row[0] for row in list_indexes.fetchall()] 1828 for index in index_names: 1829 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1830 self.conn.execute(sql_drop_table_index) 1831 1832 def read_vcf_header(self, f) -> list: 1833 """ 1834 It reads the header of a VCF file and returns a list of the header lines 1835 1836 :param f: the file object 1837 :return: The header lines of the VCF file. 
1838 """ 1839 1840 header_list = [] 1841 for line in f: 1842 header_list.append(line) 1843 if line.startswith("#CHROM"): 1844 break 1845 return header_list 1846 1847 def read_vcf_header_file(self, file: str = None) -> list: 1848 """ 1849 The function `read_vcf_header_file` reads the header of a VCF file, either from a compressed or 1850 uncompressed file. 1851 1852 :param file: The `file` parameter is a string that represents the path to the VCF header file 1853 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1854 default to `None` 1855 :type file: str 1856 :param compressed: The `compressed` parameter is a boolean flag that indicates whether the VCF 1857 file is compressed or not. If `compressed` is set to `True`, it means that the VCF file is 1858 compressed using the BGZF compression format. If `compressed` is set to `False`, it means that, 1859 defaults to False 1860 :type compressed: bool (optional) 1861 :return: a list. 1862 """ 1863 1864 if self.get_input_compressed(input_file=file): 1865 with bgzf.open(file, "rt") as f: 1866 return self.read_vcf_header(f=f) 1867 else: 1868 with open(file, "rt") as f: 1869 return self.read_vcf_header(f=f) 1870 1871 def execute_query(self, query: str): 1872 """ 1873 It takes a query as an argument, executes it, and returns the results 1874 1875 :param query: The query to be executed 1876 :return: The result of the query is being returned. 
1877 """ 1878 if query: 1879 return self.conn.execute(query) # .fetchall() 1880 else: 1881 return None 1882 1883 def export_output( 1884 self, 1885 output_file: str | None = None, 1886 output_header: str | None = None, 1887 export_header: bool = True, 1888 query: str | None = None, 1889 parquet_partitions: list | None = None, 1890 chunk_size: int | None = None, 1891 threads: int | None = None, 1892 sort: bool = False, 1893 index: bool = False, 1894 order_by: str | None = None, 1895 ) -> bool: 1896 """ 1897 The `export_output` function exports data from a VCF file to a specified output file in various 1898 formats, including VCF, CSV, TSV, PSV, and Parquet. 1899 1900 :param output_file: The `output_file` parameter is a string that specifies the name of the 1901 output file to be generated by the function. This is where the exported data will be saved 1902 :type output_file: str 1903 :param output_header: The `output_header` parameter is a string that specifies the name of the 1904 file where the header of the VCF file will be exported. If this parameter is not provided, the 1905 header will be exported to a file with the same name as the `output_file` parameter, but with 1906 the extension " 1907 :type output_header: str 1908 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1909 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1910 True, the header will be exported to a file. If `export_header` is False, the header will not 1911 be, defaults to True, if output format is not VCF 1912 :type export_header: bool (optional) 1913 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1914 select specific data from the VCF file before exporting it. 
If provided, only the data that 1915 matches the query will be exported 1916 :type query: str 1917 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1918 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1919 organize data in a hierarchical directory structure based on the values of one or more columns. 1920 This can improve query performance when working with large datasets 1921 :type parquet_partitions: list 1922 :param chunk_size: The `chunk_size` parameter specifies the number of 1923 records in batch when exporting data in Parquet format. This parameter is used for 1924 partitioning the Parquet file into multiple files. 1925 :type chunk_size: int 1926 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1927 threads to be used during the export process. It determines the level of parallelism and can 1928 improve the performance of the export operation. If not provided, the function will use the 1929 default number of threads 1930 :type threads: int 1931 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1932 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1933 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1934 False 1935 :type sort: bool (optional) 1936 :param index: The `index` parameter is a boolean flag that determines whether an index should be 1937 created on the output file. If `index` is True, an index will be created. If `index` is False, 1938 no index will be created. The default value is False, defaults to False 1939 :type index: bool (optional) 1940 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 1941 sorting the output file. This parameter is only applicable when exporting data in VCF format 1942 :type order_by: str 1943 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 1944 None if it doesn't. 1945 """ 1946 1947 # Log 1948 log.info("Exporting...") 1949 1950 # Full path 1951 output_file = full_path(output_file) 1952 output_header = full_path(output_header) 1953 1954 # Config 1955 config = self.get_config() 1956 1957 # Param 1958 param = self.get_param() 1959 1960 # Tmp files to remove 1961 tmp_to_remove = [] 1962 1963 # If no output, get it 1964 if not output_file: 1965 output_file = self.get_output() 1966 1967 # If not threads 1968 if not threads: 1969 threads = self.get_threads() 1970 1971 # Auto header name with extension 1972 if export_header or output_header: 1973 if not output_header: 1974 output_header = f"{output_file}.hdr" 1975 # Export header 1976 self.export_header(output_file=output_file) 1977 1978 # Switch off export header if VCF output 1979 output_file_type = get_file_format(output_file) 1980 if output_file_type in ["vcf"]: 1981 export_header = False 1982 tmp_to_remove.append(output_header) 1983 1984 # Chunk size 1985 if not chunk_size: 1986 chunk_size = config.get("chunk_size", None) 1987 1988 # Parquet partition 1989 if not parquet_partitions: 1990 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 1991 if parquet_partitions and isinstance(parquet_partitions, str): 1992 parquet_partitions = parquet_partitions.split(",") 1993 1994 # Order by 1995 if not order_by: 1996 order_by = param.get("export", {}).get("order_by", "") 1997 1998 # Header in output 1999 header_in_output = param.get("export", {}).get("include_header", False) 2000 2001 # Database 2002 database_source = self.get_connexion() 2003 2004 # Connexion format 2005 connexion_format = self.get_connexion_format() 2006 2007 # Explode infos 2008 if self.get_explode_infos(): 2009 self.explode_infos( 2010 prefix=self.get_explode_infos_prefix(), 2011 fields=self.get_explode_infos_fields(), 2012 force=False, 2013 ) 2014 2015 # if connexion_format in ["sqlite"] or query: 
2016 if connexion_format in ["sqlite"]: 2017 2018 # Export in Parquet 2019 random_tmp = "".join( 2020 random.choice(string.ascii_lowercase) for i in range(10) 2021 ) 2022 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2023 tmp_to_remove.append(database_source) 2024 2025 # Table Variants 2026 table_variants = self.get_table_variants() 2027 2028 # Create export query 2029 sql_query_export_subquery = f""" 2030 SELECT * FROM {table_variants} 2031 """ 2032 2033 # Write source file 2034 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2035 2036 # Create database 2037 database = Database( 2038 database=database_source, 2039 table="variants", 2040 header_file=output_header, 2041 conn_config=self.get_connexion_config(), 2042 ) 2043 2044 # Existing colomns header 2045 # existing_columns_header = database.get_header_file_columns(output_header) 2046 existing_columns_header = database.get_header_columns_from_database() 2047 2048 # Export file 2049 database.export( 2050 output_database=output_file, 2051 output_header=output_header, 2052 existing_columns_header=existing_columns_header, 2053 parquet_partitions=parquet_partitions, 2054 chunk_size=chunk_size, 2055 threads=threads, 2056 sort=sort, 2057 index=index, 2058 header_in_output=header_in_output, 2059 order_by=order_by, 2060 query=query, 2061 export_header=export_header, 2062 ) 2063 2064 # Remove 2065 remove_if_exists(tmp_to_remove) 2066 2067 return (os.path.exists(output_file) or None) and ( 2068 os.path.exists(output_file) or None 2069 ) 2070 2071 def get_extra_infos(self, table: str = None) -> list: 2072 """ 2073 > This function returns a list of columns that are in the table but not in the header 2074 2075 The function is called `get_extra_infos` and it takes two arguments: `self` and `table`. The 2076 `self` argument is a reference to the object that called the function. 
The `table` argument is 2077 the name of the table that we want to get the extra columns from 2078 2079 :param table: The table to get the extra columns from. If not specified, it will use the 2080 variants table 2081 :param format: The format of the output. If it's "sql", it will return a string of the extra 2082 columns separated by commas. If it's "list", it will return a list of the extra columns 2083 :return: A list of columns that are in the table but not in the header 2084 """ 2085 2086 header_columns = [] 2087 2088 if not table: 2089 table = self.get_table_variants(clause="from") 2090 header_columns = self.get_header_columns() 2091 2092 # Check all columns in the database 2093 query = f""" SELECT * FROM {table} LIMIT 1 """ 2094 log.debug(f"query {query}") 2095 table_columns = self.get_query_to_df(query).columns.tolist() 2096 extra_columns = [] 2097 2098 # Construct extra infos (not in header) 2099 for column in table_columns: 2100 if column not in header_columns: 2101 extra_columns.append(column) 2102 2103 return extra_columns 2104 2105 def get_extra_infos_sql(self, table: str = None) -> str: 2106 """ 2107 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2108 by double quotes 2109 2110 :param table: The name of the table to get the extra infos from. If None, the default table is 2111 used 2112 :type table: str 2113 :return: A string of the extra infos 2114 """ 2115 2116 return ", ".join( 2117 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2118 ) 2119 2120 def export_header( 2121 self, 2122 header_name: str = None, 2123 output_file: str = None, 2124 output_file_ext: str = ".hdr", 2125 clean_header: bool = True, 2126 remove_chrom_line: bool = False, 2127 ) -> str: 2128 """ 2129 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2130 specified options, and writes it to a new file. 
2131 2132 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2133 this parameter is not specified, the header will be written to the output file 2134 :type header_name: str 2135 :param output_file: The `output_file` parameter in the `export_header` function is used to 2136 specify the name of the output file where the header will be written. If this parameter is not 2137 provided, the header will be written to a temporary file 2138 :type output_file: str 2139 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2140 string that represents the extension of the output header file. By default, it is set to ".hdr" 2141 if not specified by the user. This extension will be appended to the `output_file` name to 2142 create the final, defaults to .hdr 2143 :type output_file_ext: str (optional) 2144 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2145 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2146 `True`, the function will clean the header by modifying certain lines based on a specific 2147 pattern. If `clean_header`, defaults to True 2148 :type clean_header: bool (optional) 2149 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2150 boolean flag that determines whether the #CHROM line should be removed from the header before 2151 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2152 defaults to False 2153 :type remove_chrom_line: bool (optional) 2154 :return: The function `export_header` returns the name of the temporary header file that is 2155 created. 
2156 """ 2157 2158 if not header_name and not output_file: 2159 output_file = self.get_output() 2160 2161 if self.get_header(): 2162 2163 # Get header object 2164 header_obj = self.get_header() 2165 2166 # Create database 2167 db_for_header = Database(database=self.get_input()) 2168 2169 # Get real columns in the file 2170 db_header_columns = db_for_header.get_columns() 2171 2172 with tempfile.TemporaryDirectory() as tmpdir: 2173 2174 # Write header file 2175 header_file_tmp = os.path.join(tmpdir, "header") 2176 f = open(header_file_tmp, "w") 2177 vcf.Writer(f, header_obj) 2178 f.close() 2179 2180 # Replace #CHROM line with rel columns 2181 header_list = db_for_header.read_header_file( 2182 header_file=header_file_tmp 2183 ) 2184 header_list[-1] = "\t".join(db_header_columns) 2185 2186 # Remove CHROM line 2187 if remove_chrom_line: 2188 header_list.pop() 2189 2190 # Clean header 2191 if clean_header: 2192 header_list_clean = [] 2193 for head in header_list: 2194 # Clean head for malformed header 2195 head_clean = head 2196 head_clean = re.subn( 2197 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2198 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2199 head_clean, 2200 2, 2201 )[0] 2202 # Write header 2203 header_list_clean.append(head_clean) 2204 header_list = header_list_clean 2205 2206 tmp_header_name = output_file + output_file_ext 2207 2208 f = open(tmp_header_name, "w") 2209 for line in header_list: 2210 f.write(line) 2211 f.close() 2212 2213 return tmp_header_name 2214 2215 def export_variant_vcf( 2216 self, 2217 vcf_file, 2218 remove_info: bool = False, 2219 add_samples: bool = True, 2220 list_samples: list = [], 2221 index: bool = False, 2222 threads: int | None = None, 2223 ) -> bool | None: 2224 """ 2225 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2226 remove INFO field, add samples, and control compression and indexing. 
2227 2228 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2229 written to. It is the output file that will contain the filtered VCF data based on the specified 2230 parameters 2231 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2232 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2233 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2234 in, defaults to False 2235 :type remove_info: bool (optional) 2236 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2237 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2238 If set to False, the samples will be removed. The default value is True, defaults to True 2239 :type add_samples: bool (optional) 2240 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2241 in the output VCF file. By default, all samples will be included. If you provide a list of 2242 samples, only those samples will be included in the output file 2243 :type list_samples: list 2244 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2245 determines whether or not to create an index for the output VCF file. If `index` is set to 2246 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2247 :type index: bool (optional) 2248 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2249 number of threads to use for exporting the VCF file. It determines how many parallel threads 2250 will be used during the export process. More threads can potentially speed up the export process 2251 by utilizing multiple cores of the processor. 
If 2252 :type threads: int | None 2253 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2254 method with various parameters including the output file, query, threads, sort flag, and index 2255 flag. The `export_output` method is responsible for exporting the VCF data based on the 2256 specified parameters and configurations provided in the `export_variant_vcf` function. 2257 """ 2258 2259 # Config 2260 config = self.get_config() 2261 2262 # Extract VCF 2263 log.debug("Export VCF...") 2264 2265 # Table variants 2266 table_variants = self.get_table_variants() 2267 2268 # Threads 2269 if not threads: 2270 threads = self.get_threads() 2271 2272 # Info fields 2273 if remove_info: 2274 if not isinstance(remove_info, str): 2275 remove_info = "." 2276 info_field = f"""'{remove_info}' as INFO""" 2277 else: 2278 info_field = "INFO" 2279 2280 # Samples fields 2281 if add_samples: 2282 if not list_samples: 2283 list_samples = self.get_header_sample_list() 2284 if list_samples: 2285 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2286 else: 2287 samples_fields = "" 2288 log.debug(f"samples_fields: {samples_fields}") 2289 else: 2290 samples_fields = "" 2291 2292 # Variants 2293 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2294 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} """ 2295 2296 return self.export_output( 2297 output_file=vcf_file, 2298 output_header=None, 2299 export_header=True, 2300 query=sql_query_select, 2301 parquet_partitions=None, 2302 chunk_size=config.get("chunk_size", None), 2303 threads=threads, 2304 sort=True, 2305 index=index, 2306 order_by=None, 2307 ) 2308 2309 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2310 """ 2311 It takes a list of commands and runs them in parallel using the number of threads specified 2312 2313 :param commands: A list of commands to run 2314 :param threads: The number of threads 
    def get_threads(self, default: int = 1) -> int:
        """
        Resolve the number of threads to use for the current job.

        Priority: param "threads", then config "threads", then `default`.
        A non-positive configured value means "use every available CPU".

        :param default: Fallback thread count when none is configured,
            defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads: param takes precedence over config
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            # Non-positive value means "all cores"
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        Resolve the memory setting for the current job.

        Priority: param "memory", then config "memory", then `default`.

        :param default: Fallback value when no memory setting is present in
            either the param or the config dictionary
        :type default: str
        :return: The memory setting (presumably a string like "8G" — format
            not enforced here), or `default` when nothing is configured.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory: param takes precedence over config
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back on the provided default
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table from a VCF file, dispatching on the
        connexion format: duckdb uses the pandas-based path, sqlite uses a
        temporary-table path. Other formats are silently ignored.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb).

        The VCF is loaded into a local pandas DataFrame; the UPDATE query
        references it as `vcf_df`, which duckdb resolves against the local
        variable. Matching is on #CHROM/POS/REF/ALT; non-empty INFO values
        are concatenated with ';'.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO ('' and '.' count as
        # empty; a ';' separator is inserted only when both sides are
        # non-empty)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (sqlite).

        Creates a temporary table mirroring 'variants', loads the VCF into
        it, then appends each matching INFO value to the variants' INFO
        column. Matching is on #CHROM/POS/REF/ALT.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table (header lines start with '#')
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        # NOTE(review): when no tmp_vcf row matches a variant, the correlated
        # subquery yields NULL and '' || NULL is NULL in SQLite, so INFO may
        # become NULL for unmatched variants — confirm this is intended
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                    WHEN INFO NOT IN ('', '.')
                    THEN INFO
                    ELSE ''
                   END ||
            (
            SELECT
                CASE
                    WHEN table_variants.INFO NOT IN ('','.')
                     AND table_vcf.INFO NOT IN ('','.')
                    THEN ';'
                    ELSE ''
                END ||
                CASE
                    WHEN table_vcf.INFO NOT IN ('','.')
                    THEN table_vcf.INFO
                    ELSE ''
                END
            FROM {table_vcf} as table_vcf
            WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                AND table_vcf.\"POS\" = table_variants.\"POS\"
                AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                AND table_vcf.\"REF\" = table_variants.\"REF\"
            )
        """
        self.conn.execute(sql_query_update)

        # Drop the temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a column to the variants table and populate it with a hash of the
        `#CHROM`, `POS`, `REF`, and `ALT` columns (plus assembly and SVTYPE
        tag).

        :param variant_id_column: The name of the column to be created in the variants table, defaults
            to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be (re)created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param overrides config, falls back on the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE so it is available as a table column
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fallback column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column if absent (or forced)
        # NOTE(review): the existence check uses the hard-coded name
        # "variant_id" rather than `variant_id_column` — confirm intended
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the SQL string literal
            # '"{prefix}SVTYPE"' (the quoted column NAME, not its value) —
            # verify whether the SVTYPE column value was intended here
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
added_column in added_columns: 2573 self.drop_column(column=added_column) 2574 2575 # return variant_id column name 2576 return variant_id_column 2577 2578 def get_variant_id_column( 2579 self, variant_id_column: str = "variant_id", force: bool = None 2580 ) -> str: 2581 """ 2582 This function returns the variant_id column name 2583 2584 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2585 defaults to variant_id 2586 :type variant_id_column: str (optional) 2587 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2588 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2589 if it is not already set, or if it is set 2590 :type force: bool 2591 :return: The variant_id column name. 2592 """ 2593 2594 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2595 2596 ### 2597 # Annotation 2598 ### 2599 2600 def scan_databases( 2601 self, database_formats: list["parquet"], database_releases: list = ["current"] 2602 ) -> dict: 2603 """ 2604 The function `scan_databases` scans for available databases based on specified formats and 2605 releases. 2606 2607 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2608 of the databases to be scanned. In this case, the accepted format is "parquet" 2609 :type database_formats: list ["parquet"] 2610 :param database_releases: The `database_releases` parameter is a list that specifies the 2611 releases of the databases to be scanned. In the provided function, the default value for 2612 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2613 databases that are in the "current" 2614 :type database_releases: list 2615 :return: The function `scan_databases` returns a dictionary containing information about 2616 databases that match the specified formats and releases. 
2617 """ 2618 2619 # Config 2620 config = self.get_config() 2621 2622 # Param 2623 param = self.get_param() 2624 2625 # Param - Assembly 2626 assembly = param.get("assembly", config.get("assembly", None)) 2627 if not assembly: 2628 assembly = DEFAULT_ASSEMBLY 2629 log.warning(f"Default assembly '{assembly}'") 2630 2631 # Scan for availabled databases 2632 log.info( 2633 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2634 ) 2635 databases_infos_dict = databases_infos( 2636 database_folder_releases=database_releases, 2637 database_formats=database_formats, 2638 assembly=assembly, 2639 config=config, 2640 ) 2641 log.info( 2642 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2643 ) 2644 2645 return databases_infos_dict 2646 2647 def annotation(self) -> None: 2648 """ 2649 It annotates the VCF file with the annotations specified in the config file. 
2650 """ 2651 2652 # Config 2653 config = self.get_config() 2654 2655 # Param 2656 param = self.get_param() 2657 2658 # Param - Assembly 2659 assembly = param.get("assembly", config.get("assembly", None)) 2660 if not assembly: 2661 assembly = DEFAULT_ASSEMBLY 2662 log.warning(f"Default assembly '{assembly}'") 2663 2664 # annotations databases folders 2665 annotations_databases = set( 2666 config.get("folders", {}) 2667 .get("databases", {}) 2668 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2669 + config.get("folders", {}) 2670 .get("databases", {}) 2671 .get("parquet", ["~/howard/databases/parquet/current"]) 2672 + config.get("folders", {}) 2673 .get("databases", {}) 2674 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2675 ) 2676 2677 # Get param annotations 2678 if param.get("annotations", None) and isinstance( 2679 param.get("annotations", None), str 2680 ): 2681 log.debug(param.get("annotations", None)) 2682 param_annotation_list = param.get("annotations").split(",") 2683 else: 2684 param_annotation_list = [] 2685 2686 # Each tools param 2687 if param.get("annotation_parquet", None) != None: 2688 log.debug( 2689 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2690 ) 2691 if isinstance(param.get("annotation_parquet", None), list): 2692 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2693 else: 2694 param_annotation_list.append(param.get("annotation_parquet")) 2695 if param.get("annotation_snpsift", None) != None: 2696 if isinstance(param.get("annotation_snpsift", None), list): 2697 param_annotation_list.append( 2698 "snpsift:" 2699 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2700 ) 2701 else: 2702 param_annotation_list.append( 2703 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2704 ) 2705 if param.get("annotation_snpeff", None) != None: 2706 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2707 if param.get("annotation_bcftools", 
None) != None: 2708 if isinstance(param.get("annotation_bcftools", None), list): 2709 param_annotation_list.append( 2710 "bcftools:" 2711 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2712 ) 2713 else: 2714 param_annotation_list.append( 2715 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2716 ) 2717 if param.get("annotation_annovar", None) != None: 2718 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2719 if param.get("annotation_exomiser", None) != None: 2720 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2721 if param.get("annotation_splice", None) != None: 2722 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2723 2724 # Merge param annotations list 2725 param["annotations"] = ",".join(param_annotation_list) 2726 2727 # debug 2728 log.debug(f"param_annotations={param['annotations']}") 2729 2730 if param.get("annotations"): 2731 2732 # Log 2733 # log.info("Annotations - Check annotation parameters") 2734 2735 if not "annotation" in param: 2736 param["annotation"] = {} 2737 2738 # List of annotations parameters 2739 annotations_list_input = {} 2740 if isinstance(param.get("annotations", None), str): 2741 annotation_file_list = [ 2742 value for value in param.get("annotations", "").split(",") 2743 ] 2744 for annotation_file in annotation_file_list: 2745 annotations_list_input[annotation_file] = {"INFO": None} 2746 else: 2747 annotations_list_input = param.get("annotations", {}) 2748 2749 log.info(f"Quick Annotations:") 2750 for annotation_key in list(annotations_list_input.keys()): 2751 log.info(f" {annotation_key}") 2752 2753 # List of annotations and associated fields 2754 annotations_list = {} 2755 2756 for annotation_file in annotations_list_input: 2757 2758 # Explode annotations if ALL 2759 if ( 2760 annotation_file.upper() == "ALL" 2761 or annotation_file.upper().startswith("ALL:") 2762 ): 2763 2764 # check ALL parameters (formats, releases) 
2765 annotation_file_split = annotation_file.split(":") 2766 database_formats = "parquet" 2767 database_releases = "current" 2768 for annotation_file_option in annotation_file_split[1:]: 2769 database_all_options_split = annotation_file_option.split("=") 2770 if database_all_options_split[0] == "format": 2771 database_formats = database_all_options_split[1].split("+") 2772 if database_all_options_split[0] == "release": 2773 database_releases = database_all_options_split[1].split("+") 2774 2775 # Scan for availabled databases 2776 databases_infos_dict = self.scan_databases( 2777 database_formats=database_formats, 2778 database_releases=database_releases, 2779 ) 2780 2781 # Add found databases in annotation parameters 2782 for database_infos in databases_infos_dict.keys(): 2783 annotations_list[database_infos] = {"INFO": None} 2784 2785 else: 2786 annotations_list[annotation_file] = annotations_list_input[ 2787 annotation_file 2788 ] 2789 2790 # Check each databases 2791 if len(annotations_list): 2792 2793 log.info( 2794 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2795 ) 2796 2797 for annotation_file in annotations_list: 2798 2799 # Init 2800 annotations = annotations_list.get(annotation_file, None) 2801 2802 # Annotation snpEff 2803 if annotation_file.startswith("snpeff"): 2804 2805 log.debug(f"Quick Annotation snpEff") 2806 2807 if "snpeff" not in param["annotation"]: 2808 param["annotation"]["snpeff"] = {} 2809 2810 if "options" not in param["annotation"]["snpeff"]: 2811 param["annotation"]["snpeff"]["options"] = "" 2812 2813 # snpEff options in annotations 2814 param["annotation"]["snpeff"]["options"] = "".join( 2815 annotation_file.split(":")[1:] 2816 ) 2817 2818 # Annotation Annovar 2819 elif annotation_file.startswith("annovar"): 2820 2821 log.debug(f"Quick Annotation Annovar") 2822 2823 if "annovar" not in param["annotation"]: 2824 param["annotation"]["annovar"] = {} 2825 2826 if "annotations" not in param["annotation"]["annovar"]: 2827 param["annotation"]["annovar"]["annotations"] = {} 2828 2829 # Options 2830 annotation_file_split = annotation_file.split(":") 2831 for annotation_file_annotation in annotation_file_split[1:]: 2832 if annotation_file_annotation: 2833 param["annotation"]["annovar"]["annotations"][ 2834 annotation_file_annotation 2835 ] = annotations 2836 2837 # Annotation Exomiser 2838 elif annotation_file.startswith("exomiser"): 2839 2840 log.debug(f"Quick Annotation Exomiser") 2841 2842 param["annotation"]["exomiser"] = params_string_to_dict( 2843 annotation_file 2844 ) 2845 2846 # Annotation Splice 2847 elif annotation_file.startswith("splice"): 2848 2849 log.debug(f"Quick Annotation Splice") 2850 2851 param["annotation"]["splice"] = params_string_to_dict( 2852 annotation_file 2853 ) 2854 2855 # Annotation Parquet or BCFTOOLS 2856 else: 2857 2858 # Tools detection 2859 if annotation_file.startswith("bcftools:"): 2860 annotation_tool_initial = "bcftools" 2861 annotation_file = ":".join(annotation_file.split(":")[1:]) 2862 elif annotation_file.startswith("snpsift:"): 2863 annotation_tool_initial = 
"snpsift" 2864 annotation_file = ":".join(annotation_file.split(":")[1:]) 2865 else: 2866 annotation_tool_initial = None 2867 2868 # list of files 2869 annotation_file_list = annotation_file.replace("+", ":").split( 2870 ":" 2871 ) 2872 2873 for annotation_file in annotation_file_list: 2874 2875 if annotation_file: 2876 2877 # Annotation tool initial 2878 annotation_tool = annotation_tool_initial 2879 2880 # Find file 2881 annotation_file_found = None 2882 2883 # Expand user 2884 annotation_file = full_path(annotation_file) 2885 2886 if os.path.exists(annotation_file): 2887 annotation_file_found = annotation_file 2888 2889 else: 2890 # Find within assembly folders 2891 for annotations_database in annotations_databases: 2892 found_files = find_all( 2893 annotation_file, 2894 os.path.join( 2895 annotations_database, assembly 2896 ), 2897 ) 2898 if len(found_files) > 0: 2899 annotation_file_found = found_files[0] 2900 break 2901 if not annotation_file_found and not assembly: 2902 # Find within folders 2903 for ( 2904 annotations_database 2905 ) in annotations_databases: 2906 found_files = find_all( 2907 annotation_file, annotations_database 2908 ) 2909 if len(found_files) > 0: 2910 annotation_file_found = found_files[0] 2911 break 2912 log.debug( 2913 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2914 ) 2915 2916 # Full path 2917 annotation_file_found = full_path(annotation_file_found) 2918 2919 if annotation_file_found: 2920 2921 database = Database(database=annotation_file_found) 2922 quick_annotation_format = database.get_format() 2923 quick_annotation_is_compressed = ( 2924 database.is_compressed() 2925 ) 2926 quick_annotation_is_indexed = os.path.exists( 2927 f"{annotation_file_found}.tbi" 2928 ) 2929 bcftools_preference = False 2930 2931 # Check Annotation Tool 2932 if not annotation_tool: 2933 if ( 2934 bcftools_preference 2935 and quick_annotation_format 2936 in ["vcf", "bed"] 2937 and quick_annotation_is_compressed 2938 and 
quick_annotation_is_indexed 2939 ): 2940 annotation_tool = "bcftools" 2941 elif quick_annotation_format in [ 2942 "vcf", 2943 "bed", 2944 "tsv", 2945 "tsv", 2946 "csv", 2947 "json", 2948 "tbl", 2949 "parquet", 2950 "duckdb", 2951 ]: 2952 annotation_tool = "parquet" 2953 else: 2954 log.error( 2955 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 2956 ) 2957 raise ValueError( 2958 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 2959 ) 2960 2961 log.debug( 2962 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 2963 ) 2964 2965 # Annotation Tool dispatch 2966 if annotation_tool: 2967 if annotation_tool not in param["annotation"]: 2968 param["annotation"][annotation_tool] = {} 2969 if ( 2970 "annotations" 2971 not in param["annotation"][annotation_tool] 2972 ): 2973 param["annotation"][annotation_tool][ 2974 "annotations" 2975 ] = {} 2976 param["annotation"][annotation_tool][ 2977 "annotations" 2978 ][annotation_file_found] = annotations 2979 2980 else: 2981 log.error( 2982 f"Quick Annotation File {annotation_file} does NOT exist" 2983 ) 2984 2985 self.set_param(param) 2986 2987 if param.get("annotation", None): 2988 log.info("Annotations") 2989 if param.get("annotation", {}).get("parquet", None): 2990 log.info("Annotations 'parquet'...") 2991 self.annotation_parquet() 2992 if param.get("annotation", {}).get("bcftools", None): 2993 log.info("Annotations 'bcftools'...") 2994 self.annotation_bcftools() 2995 if param.get("annotation", {}).get("snpsift", None): 2996 log.info("Annotations 'snpsift'...") 2997 self.annotation_snpsift() 2998 if param.get("annotation", {}).get("annovar", None): 2999 log.info("Annotations 'annovar'...") 3000 self.annotation_annovar() 3001 if param.get("annotation", {}).get("snpeff", None): 3002 log.info("Annotations 'snpeff'...") 3003 self.annotation_snpeff() 3004 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3005 log.info("Annotations 'exomiser'...") 3006 self.annotation_exomiser() 3007 if param.get("annotation", {}).get("splice", None) is not None: 3008 log.info("Annotations 'splice' ...") 3009 self.annotation_splice() 3010 3011 # Explode INFOS fields into table fields 3012 if self.get_explode_infos(): 3013 self.explode_infos( 3014 prefix=self.get_explode_infos_prefix(), 3015 fields=self.get_explode_infos_fields(), 3016 force=True, 3017 ) 3018 3019 def annotation_snpsift(self, threads: int = None) -> None: 3020 """ 3021 This function annotate with bcftools 3022 3023 :param threads: Number of threads to use 3024 :return: the value of the variable "return_value". 3025 """ 3026 3027 # DEBUG 3028 log.debug("Start annotation with bcftools databases") 3029 3030 # Threads 3031 if not threads: 3032 threads = self.get_threads() 3033 log.debug("Threads: " + str(threads)) 3034 3035 # Config 3036 config = self.get_config() 3037 log.debug("Config: " + str(config)) 3038 3039 # Config - snpSift 3040 snpsift_bin_command = get_bin_command( 3041 bin="SnpSift.jar", 3042 tool="snpsift", 3043 bin_type="jar", 3044 config=config, 3045 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3046 ) 3047 if not snpsift_bin_command: 3048 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3049 log.error(msg_err) 3050 raise ValueError(msg_err) 3051 3052 # Config - bcftools 3053 bcftools_bin_command = get_bin_command( 3054 bin="bcftools", 3055 tool="bcftools", 3056 bin_type="bin", 3057 config=config, 3058 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3059 ) 3060 if not bcftools_bin_command: 3061 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3062 log.error(msg_err) 3063 raise ValueError(msg_err) 3064 3065 # Config - BCFTools databases folders 3066 databases_folders = set( 3067 self.get_config() 3068 .get("folders", {}) 3069 .get("databases", {}) 3070 .get("annotations", ["."]) 3071 + self.get_config() 3072 .get("folders", {}) 3073 
.get("databases", {}) 3074 .get("bcftools", ["."]) 3075 ) 3076 log.debug("Databases annotations: " + str(databases_folders)) 3077 3078 # Param 3079 annotations = ( 3080 self.get_param() 3081 .get("annotation", {}) 3082 .get("snpsift", {}) 3083 .get("annotations", None) 3084 ) 3085 log.debug("Annotations: " + str(annotations)) 3086 3087 # Assembly 3088 assembly = self.get_param().get( 3089 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3090 ) 3091 3092 # Data 3093 table_variants = self.get_table_variants() 3094 3095 # Check if not empty 3096 log.debug("Check if not empty") 3097 sql_query_chromosomes = ( 3098 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3099 ) 3100 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3101 if not sql_query_chromosomes_df["count"][0]: 3102 log.info(f"VCF empty") 3103 return 3104 3105 # VCF header 3106 vcf_reader = self.get_header() 3107 log.debug("Initial header: " + str(vcf_reader.infos)) 3108 3109 # Existing annotations 3110 for vcf_annotation in self.get_header().infos: 3111 3112 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3113 log.debug( 3114 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3115 ) 3116 3117 if annotations: 3118 3119 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3120 3121 # Export VCF file 3122 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3123 3124 # Init 3125 commands = {} 3126 3127 for annotation in annotations: 3128 annotation_fields = annotations[annotation] 3129 3130 # Annotation Name 3131 annotation_name = os.path.basename(annotation) 3132 3133 if not annotation_fields: 3134 annotation_fields = {"INFO": None} 3135 3136 log.debug(f"Annotation '{annotation_name}'") 3137 log.debug( 3138 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3139 ) 3140 3141 # Create Database 3142 database = Database( 3143 database=annotation, 3144 databases_folders=databases_folders, 3145 
assembly=assembly, 3146 ) 3147 3148 # Find files 3149 db_file = database.get_database() 3150 db_file = full_path(db_file) 3151 db_hdr_file = database.get_header_file() 3152 db_hdr_file = full_path(db_hdr_file) 3153 db_file_type = database.get_format() 3154 db_tbi_file = f"{db_file}.tbi" 3155 db_file_compressed = database.is_compressed() 3156 3157 # Check if compressed 3158 if not db_file_compressed: 3159 log.error( 3160 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3161 ) 3162 raise ValueError( 3163 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3164 ) 3165 3166 # Check if indexed 3167 if not os.path.exists(db_tbi_file): 3168 log.error( 3169 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3170 ) 3171 raise ValueError( 3172 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3173 ) 3174 3175 # Check index - try to create if not exists 3176 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3177 log.error("Annotation failed: database not valid") 3178 log.error(f"Annotation annotation file: {db_file}") 3179 log.error(f"Annotation annotation header: {db_hdr_file}") 3180 log.error(f"Annotation annotation index: {db_tbi_file}") 3181 raise ValueError( 3182 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3183 ) 3184 else: 3185 3186 log.debug( 3187 f"Annotation '{annotation}' - file: " 3188 + str(db_file) 3189 + " and " 3190 + str(db_hdr_file) 3191 ) 3192 3193 # Load header as VCF object 3194 db_hdr_vcf = Variants(input=db_hdr_file) 3195 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3196 log.debug( 3197 "Annotation database header: " 3198 + str(db_hdr_vcf_header_infos) 3199 ) 3200 3201 # For all fields in database 3202 annotation_fields_full = False 3203 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3204 annotation_fields = { 3205 key: key for key in 
db_hdr_vcf_header_infos 3206 } 3207 log.debug( 3208 "Annotation database header - All annotations added: " 3209 + str(annotation_fields) 3210 ) 3211 annotation_fields_full = True 3212 3213 # # Create file for field rename 3214 # log.debug("Create file for field rename") 3215 # tmp_rename = NamedTemporaryFile( 3216 # prefix=self.get_prefix(), 3217 # dir=self.get_tmp_dir(), 3218 # suffix=".rename", 3219 # delete=False, 3220 # ) 3221 # tmp_rename_name = tmp_rename.name 3222 # tmp_files.append(tmp_rename_name) 3223 3224 # Number of fields 3225 nb_annotation_field = 0 3226 annotation_list = [] 3227 annotation_infos_rename_list = [] 3228 3229 for annotation_field in annotation_fields: 3230 3231 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3232 annotation_fields_new_name = annotation_fields.get( 3233 annotation_field, annotation_field 3234 ) 3235 if not annotation_fields_new_name: 3236 annotation_fields_new_name = annotation_field 3237 3238 # Check if field is in DB and if field is not elready in input data 3239 if ( 3240 annotation_field in db_hdr_vcf.get_header().infos 3241 and annotation_fields_new_name 3242 not in self.get_header().infos 3243 ): 3244 3245 log.info( 3246 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3247 ) 3248 3249 # BCFTools annotate param to rename fields 3250 if annotation_field != annotation_fields_new_name: 3251 annotation_infos_rename_list.append( 3252 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3253 ) 3254 3255 # Add INFO field to header 3256 db_hdr_vcf_header_infos_number = ( 3257 db_hdr_vcf_header_infos[annotation_field].num or "." 
3258 ) 3259 db_hdr_vcf_header_infos_type = ( 3260 db_hdr_vcf_header_infos[annotation_field].type 3261 or "String" 3262 ) 3263 db_hdr_vcf_header_infos_description = ( 3264 db_hdr_vcf_header_infos[annotation_field].desc 3265 or f"{annotation_field} description" 3266 ) 3267 db_hdr_vcf_header_infos_source = ( 3268 db_hdr_vcf_header_infos[annotation_field].source 3269 or "unknown" 3270 ) 3271 db_hdr_vcf_header_infos_version = ( 3272 db_hdr_vcf_header_infos[annotation_field].version 3273 or "unknown" 3274 ) 3275 3276 vcf_reader.infos[annotation_fields_new_name] = ( 3277 vcf.parser._Info( 3278 annotation_fields_new_name, 3279 db_hdr_vcf_header_infos_number, 3280 db_hdr_vcf_header_infos_type, 3281 db_hdr_vcf_header_infos_description, 3282 db_hdr_vcf_header_infos_source, 3283 db_hdr_vcf_header_infos_version, 3284 self.code_type_map[ 3285 db_hdr_vcf_header_infos_type 3286 ], 3287 ) 3288 ) 3289 3290 annotation_list.append(annotation_field) 3291 3292 nb_annotation_field += 1 3293 3294 else: 3295 3296 if ( 3297 annotation_field 3298 not in db_hdr_vcf.get_header().infos 3299 ): 3300 log.warning( 3301 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3302 ) 3303 if ( 3304 annotation_fields_new_name 3305 in self.get_header().infos 3306 ): 3307 log.warning( 3308 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3309 ) 3310 3311 log.info( 3312 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3313 ) 3314 3315 annotation_infos = ",".join(annotation_list) 3316 3317 if annotation_infos != "": 3318 3319 # Annotated VCF (and error file) 3320 tmp_annotation_vcf_name = os.path.join( 3321 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3322 ) 3323 tmp_annotation_vcf_name_err = ( 3324 tmp_annotation_vcf_name + ".err" 3325 ) 3326 3327 # Add fields to annotate 3328 if not annotation_fields_full: 3329 annotation_infos_option = f"-info {annotation_infos}" 3330 else: 
3331 annotation_infos_option = "" 3332 3333 # Info fields rename 3334 if annotation_infos_rename_list: 3335 annotation_infos_rename = " -c " + ",".join( 3336 annotation_infos_rename_list 3337 ) 3338 else: 3339 annotation_infos_rename = "" 3340 3341 # Annotate command 3342 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3343 3344 # Add command 3345 commands[command_annotate] = tmp_annotation_vcf_name 3346 3347 if commands: 3348 3349 # Export VCF file 3350 self.export_variant_vcf( 3351 vcf_file=tmp_vcf_name, 3352 remove_info=True, 3353 add_samples=False, 3354 index=True, 3355 ) 3356 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3357 3358 # Num command 3359 nb_command = 0 3360 3361 # Annotate 3362 for command_annotate in commands: 3363 nb_command += 1 3364 log.info( 3365 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3366 ) 3367 log.debug(f"command_annotate={command_annotate}") 3368 run_parallel_commands([command_annotate], threads) 3369 3370 # Debug 3371 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3372 3373 # Update variants 3374 log.info( 3375 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3376 ) 3377 self.update_from_vcf(commands[command_annotate]) 3378 3379 def annotation_bcftools(self, threads: int = None) -> None: 3380 """ 3381 This function annotate with bcftools 3382 3383 :param threads: Number of threads to use 3384 :return: the value of the variable "return_value". 
3385 """ 3386 3387 # DEBUG 3388 log.debug("Start annotation with bcftools databases") 3389 3390 # Threads 3391 if not threads: 3392 threads = self.get_threads() 3393 log.debug("Threads: " + str(threads)) 3394 3395 # Config 3396 config = self.get_config() 3397 log.debug("Config: " + str(config)) 3398 3399 # DEBUG 3400 delete_tmp = True 3401 if self.get_config().get("verbosity", "warning") in ["debug"]: 3402 delete_tmp = False 3403 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3404 3405 # Config - BCFTools bin command 3406 bcftools_bin_command = get_bin_command( 3407 bin="bcftools", 3408 tool="bcftools", 3409 bin_type="bin", 3410 config=config, 3411 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3412 ) 3413 if not bcftools_bin_command: 3414 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3415 log.error(msg_err) 3416 raise ValueError(msg_err) 3417 3418 # Config - BCFTools databases folders 3419 databases_folders = set( 3420 self.get_config() 3421 .get("folders", {}) 3422 .get("databases", {}) 3423 .get("annotations", ["."]) 3424 + self.get_config() 3425 .get("folders", {}) 3426 .get("databases", {}) 3427 .get("bcftools", ["."]) 3428 ) 3429 log.debug("Databases annotations: " + str(databases_folders)) 3430 3431 # Param 3432 annotations = ( 3433 self.get_param() 3434 .get("annotation", {}) 3435 .get("bcftools", {}) 3436 .get("annotations", None) 3437 ) 3438 log.debug("Annotations: " + str(annotations)) 3439 3440 # Assembly 3441 assembly = self.get_param().get( 3442 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3443 ) 3444 3445 # Data 3446 table_variants = self.get_table_variants() 3447 3448 # Check if not empty 3449 log.debug("Check if not empty") 3450 sql_query_chromosomes = ( 3451 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3452 ) 3453 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3454 if not sql_query_chromosomes_df["count"][0]: 3455 log.info(f"VCF empty") 
3456 return 3457 3458 # Export in VCF 3459 log.debug("Create initial file to annotate") 3460 tmp_vcf = NamedTemporaryFile( 3461 prefix=self.get_prefix(), 3462 dir=self.get_tmp_dir(), 3463 suffix=".vcf.gz", 3464 delete=False, 3465 ) 3466 tmp_vcf_name = tmp_vcf.name 3467 3468 # VCF header 3469 vcf_reader = self.get_header() 3470 log.debug("Initial header: " + str(vcf_reader.infos)) 3471 3472 # Existing annotations 3473 for vcf_annotation in self.get_header().infos: 3474 3475 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3476 log.debug( 3477 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3478 ) 3479 3480 if annotations: 3481 3482 tmp_ann_vcf_list = [] 3483 commands = [] 3484 tmp_files = [] 3485 err_files = [] 3486 3487 for annotation in annotations: 3488 annotation_fields = annotations[annotation] 3489 3490 # Annotation Name 3491 annotation_name = os.path.basename(annotation) 3492 3493 if not annotation_fields: 3494 annotation_fields = {"INFO": None} 3495 3496 log.debug(f"Annotation '{annotation_name}'") 3497 log.debug( 3498 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3499 ) 3500 3501 # Create Database 3502 database = Database( 3503 database=annotation, 3504 databases_folders=databases_folders, 3505 assembly=assembly, 3506 ) 3507 3508 # Find files 3509 db_file = database.get_database() 3510 db_file = full_path(db_file) 3511 db_hdr_file = database.get_header_file() 3512 db_hdr_file = full_path(db_hdr_file) 3513 db_file_type = database.get_format() 3514 db_tbi_file = f"{db_file}.tbi" 3515 db_file_compressed = database.is_compressed() 3516 3517 # Check if compressed 3518 if not db_file_compressed: 3519 log.error( 3520 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3521 ) 3522 raise ValueError( 3523 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3524 ) 3525 3526 # Check if indexed 3527 if not os.path.exists(db_tbi_file): 3528 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3529 raise ValueError( 3530 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3531 ) 3532 3533 # Check index - try to create if not exists 3534 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3535 log.error("Annotation failed: database not valid") 3536 log.error(f"Annotation annotation file: {db_file}") 3537 log.error(f"Annotation annotation header: {db_hdr_file}") 3538 log.error(f"Annotation annotation index: {db_tbi_file}") 3539 raise ValueError( 3540 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3541 ) 3542 else: 3543 3544 log.debug( 3545 f"Annotation '{annotation}' - file: " 3546 + str(db_file) 3547 + " and " 3548 + str(db_hdr_file) 3549 ) 3550 3551 # Load header as VCF object 3552 db_hdr_vcf = Variants(input=db_hdr_file) 3553 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3554 log.debug( 3555 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3556 ) 3557 3558 # For all fields in database 3559 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3560 annotation_fields = { 3561 key: key for key in db_hdr_vcf_header_infos 3562 } 3563 log.debug( 3564 "Annotation database header - All annotations added: " 3565 + str(annotation_fields) 3566 ) 3567 3568 # Number of fields 3569 nb_annotation_field = 0 3570 annotation_list = [] 3571 3572 for annotation_field in annotation_fields: 3573 3574 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3575 annotation_fields_new_name = annotation_fields.get( 3576 annotation_field, annotation_field 3577 ) 3578 if not annotation_fields_new_name: 3579 annotation_fields_new_name = annotation_field 3580 3581 # Check if field is in DB and if field is not elready in input data 3582 if ( 3583 annotation_field in db_hdr_vcf.get_header().infos 3584 and annotation_fields_new_name 3585 not in self.get_header().infos 3586 ): 3587 3588 log.info( 3589 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3590 ) 3591 3592 # Add INFO field to header 3593 db_hdr_vcf_header_infos_number = ( 3594 db_hdr_vcf_header_infos[annotation_field].num or "." 3595 ) 3596 db_hdr_vcf_header_infos_type = ( 3597 db_hdr_vcf_header_infos[annotation_field].type 3598 or "String" 3599 ) 3600 db_hdr_vcf_header_infos_description = ( 3601 db_hdr_vcf_header_infos[annotation_field].desc 3602 or f"{annotation_field} description" 3603 ) 3604 db_hdr_vcf_header_infos_source = ( 3605 db_hdr_vcf_header_infos[annotation_field].source 3606 or "unknown" 3607 ) 3608 db_hdr_vcf_header_infos_version = ( 3609 db_hdr_vcf_header_infos[annotation_field].version 3610 or "unknown" 3611 ) 3612 3613 vcf_reader.infos[annotation_fields_new_name] = ( 3614 vcf.parser._Info( 3615 annotation_fields_new_name, 3616 db_hdr_vcf_header_infos_number, 3617 db_hdr_vcf_header_infos_type, 3618 db_hdr_vcf_header_infos_description, 3619 db_hdr_vcf_header_infos_source, 3620 db_hdr_vcf_header_infos_version, 3621 self.code_type_map[db_hdr_vcf_header_infos_type], 3622 ) 3623 ) 3624 3625 # annotation_list.append(annotation_field) 3626 if annotation_field != annotation_fields_new_name: 3627 annotation_list.append( 3628 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3629 ) 3630 else: 3631 annotation_list.append(annotation_field) 3632 3633 nb_annotation_field += 1 3634 3635 else: 3636 3637 if annotation_field not in db_hdr_vcf.get_header().infos: 3638 log.warning( 3639 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3640 ) 3641 if annotation_fields_new_name in self.get_header().infos: 3642 log.warning( 3643 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3644 ) 3645 3646 log.info( 3647 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3648 ) 3649 3650 annotation_infos = ",".join(annotation_list) 3651 3652 if annotation_infos != "": 3653 3654 # Protect header for bcftools (remove "#CHROM" and variants line) 3655 log.debug("Protect Header file - remove #CHROM line if exists") 3656 tmp_header_vcf = NamedTemporaryFile( 3657 prefix=self.get_prefix(), 3658 dir=self.get_tmp_dir(), 3659 suffix=".hdr", 3660 delete=False, 3661 ) 3662 tmp_header_vcf_name = tmp_header_vcf.name 3663 tmp_files.append(tmp_header_vcf_name) 3664 # Command 3665 if db_hdr_file.endswith(".gz"): 3666 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3667 else: 3668 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3669 # Run 3670 run_parallel_commands([command_extract_header], 1) 3671 3672 # Find chomosomes 3673 log.debug("Find chromosomes ") 3674 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3675 sql_query_chromosomes_df = self.get_query_to_df( 3676 sql_query_chromosomes 3677 ) 3678 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3679 3680 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3681 3682 # BED columns in the annotation file 3683 if db_file_type in ["bed"]: 3684 annotation_infos = "CHROM,POS,POS," + annotation_infos 3685 3686 for chrom in chomosomes_list: 3687 3688 # Create BED on initial VCF 3689 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3690 tmp_bed = NamedTemporaryFile( 3691 prefix=self.get_prefix(), 3692 
dir=self.get_tmp_dir(), 3693 suffix=".bed", 3694 delete=False, 3695 ) 3696 tmp_bed_name = tmp_bed.name 3697 tmp_files.append(tmp_bed_name) 3698 3699 # Detecte regions 3700 log.debug( 3701 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3702 ) 3703 window = 1000000 3704 sql_query_intervals_for_bed = f""" 3705 SELECT \"#CHROM\", 3706 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3707 \"POS\"+{window} 3708 FROM {table_variants} as table_variants 3709 WHERE table_variants.\"#CHROM\" = '{chrom}' 3710 """ 3711 regions = self.conn.execute( 3712 sql_query_intervals_for_bed 3713 ).fetchall() 3714 merged_regions = merge_regions(regions) 3715 log.debug( 3716 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3717 ) 3718 3719 header = ["#CHROM", "START", "END"] 3720 with open(tmp_bed_name, "w") as f: 3721 # Write the header with tab delimiter 3722 f.write("\t".join(header) + "\n") 3723 for d in merged_regions: 3724 # Write each data row with tab delimiter 3725 f.write("\t".join(map(str, d)) + "\n") 3726 3727 # Tmp files 3728 tmp_annotation_vcf = NamedTemporaryFile( 3729 prefix=self.get_prefix(), 3730 dir=self.get_tmp_dir(), 3731 suffix=".vcf.gz", 3732 delete=False, 3733 ) 3734 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3735 tmp_files.append(tmp_annotation_vcf_name) 3736 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3737 tmp_annotation_vcf_name_err = ( 3738 tmp_annotation_vcf_name + ".err" 3739 ) 3740 err_files.append(tmp_annotation_vcf_name_err) 3741 3742 # Annotate Command 3743 log.debug( 3744 f"Annotation '{annotation}' - add bcftools command" 3745 ) 3746 3747 # Command 3748 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3749 3750 # Add command 3751 commands.append(command_annotate) 3752 3753 # if some commands 3754 if commands: 3755 3756 # Export VCF file 3757 self.export_variant_vcf( 3758 vcf_file=tmp_vcf_name, 3759 remove_info=True, 3760 add_samples=False, 3761 index=True, 3762 ) 3763 3764 # Threads 3765 # calculate threads for annotated commands 3766 if commands: 3767 threads_bcftools_annotate = round(threads / len(commands)) 3768 else: 3769 threads_bcftools_annotate = 1 3770 3771 if not threads_bcftools_annotate: 3772 threads_bcftools_annotate = 1 3773 3774 # Add threads option to bcftools commands 3775 if threads_bcftools_annotate > 1: 3776 commands_threaded = [] 3777 for command in commands: 3778 commands_threaded.append( 3779 command.replace( 3780 f"{bcftools_bin_command} annotate ", 3781 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3782 ) 3783 ) 3784 commands = commands_threaded 3785 3786 # Command annotation multithreading 3787 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3788 log.info( 3789 f"Annotation - Annotation multithreaded in " 3790 + str(len(commands)) 3791 + " commands" 3792 ) 3793 3794 run_parallel_commands(commands, threads) 3795 3796 # Merge 3797 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3798 3799 if tmp_ann_vcf_list_cmd: 3800 3801 # Tmp file 3802 tmp_annotate_vcf = NamedTemporaryFile( 3803 prefix=self.get_prefix(), 3804 dir=self.get_tmp_dir(), 3805 suffix=".vcf.gz", 3806 delete=True, 3807 ) 3808 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3809 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3810 err_files.append(tmp_annotate_vcf_name_err) 3811 3812 # Tmp file remove command 3813 tmp_files_remove_command = "" 3814 if tmp_files: 3815 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3816 3817 # Command merge 3818 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3819 log.info( 3820 f"Annotation - Annotation merging " 3821 + str(len(commands)) 3822 + " annotated files" 3823 ) 3824 log.debug(f"Annotation - merge command: {merge_command}") 3825 run_parallel_commands([merge_command], 1) 3826 3827 # Error messages 3828 log.info(f"Error/Warning messages:") 3829 error_message_command_all = [] 3830 error_message_command_warning = [] 3831 error_message_command_err = [] 3832 for err_file in err_files: 3833 with open(err_file, "r") as f: 3834 for line in f: 3835 message = line.strip() 3836 error_message_command_all.append(message) 3837 if line.startswith("[W::"): 3838 error_message_command_warning.append(message) 3839 if line.startswith("[E::"): 3840 error_message_command_err.append( 3841 f"{err_file}: " + message 3842 ) 3843 # log info 3844 for message in list( 3845 set(error_message_command_err + error_message_command_warning) 3846 ): 3847 log.info(f" {message}") 3848 # debug info 3849 for message in list(set(error_message_command_all)): 3850 log.debug(f" {message}") 3851 # failed 3852 if len(error_message_command_err): 3853 log.error("Annotation failed: Error in commands") 3854 raise ValueError("Annotation failed: Error in commands") 3855 3856 # Update variants 3857 log.info(f"Annotation - Updating...") 3858 self.update_from_vcf(tmp_annotate_vcf_name) 3859 3860 def annotation_exomiser(self, threads: int = None) -> None: 3861 """ 3862 This function annotate with Exomiser 3863 3864 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 3865 - "analysis" (dict/file): 3866 Full analysis dictionnary parameters (see Exomiser docs). 3867 Either a dict, or a file in JSON or YAML format. 3868 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 3869 Default : None 3870 - "preset" (string): 3871 Analysis preset (available in config folder). 
3872 Used if no full "analysis" is provided. 3873 Default: "exome" 3874 - "phenopacket" (dict/file): 3875 Samples and phenotipic features parameters (see Exomiser docs). 3876 Either a dict, or a file in JSON or YAML format. 3877 Default: None 3878 - "subject" (dict): 3879 Sample parameters (see Exomiser docs). 3880 Example: 3881 "subject": 3882 { 3883 "id": "ISDBM322017", 3884 "sex": "FEMALE" 3885 } 3886 Default: None 3887 - "sample" (string): 3888 Sample name to construct "subject" section: 3889 "subject": 3890 { 3891 "id": "<sample>", 3892 "sex": "UNKNOWN_SEX" 3893 } 3894 Default: None 3895 - "phenotypicFeatures" (dict) 3896 Phenotypic features to construct "subject" section. 3897 Example: 3898 "phenotypicFeatures": 3899 [ 3900 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3901 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3902 ] 3903 - "hpo" (list) 3904 List of HPO ids as phenotypic features. 3905 Example: 3906 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3907 Default: [] 3908 - "outputOptions" (dict): 3909 Output options (see Exomiser docs). 3910 Default: 3911 "output_options" = 3912 { 3913 "outputContributingVariantsOnly": False, 3914 "numGenes": 0, 3915 "outputFormats": ["TSV_VARIANT", "VCF"] 3916 } 3917 - "transcript_source" (string): 3918 Transcript source (either "refseq", "ucsc", "ensembl") 3919 Default: "refseq" 3920 - "exomiser_to_info" (boolean): 3921 Add exomiser TSV file columns as INFO fields in VCF. 3922 Default: False 3923 - "release" (string): 3924 Exomise database release. 3925 If not exists, database release will be downloaded (take a while). 3926 Default: None (provided by application.properties configuration file) 3927 - "exomiser_application_properties" (file): 3928 Exomiser configuration file (see Exomiser docs). 3929 Useful to automatically download databases (especially for specific genome databases). 
3930 3931 Notes: 3932 - If no sample in parameters, first sample in VCF will be chosen 3933 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 3934 3935 :param threads: The number of threads to use 3936 :return: None. 3937 """ 3938 3939 # DEBUG 3940 log.debug("Start annotation with Exomiser databases") 3941 3942 # Threads 3943 if not threads: 3944 threads = self.get_threads() 3945 log.debug("Threads: " + str(threads)) 3946 3947 # Config 3948 config = self.get_config() 3949 log.debug("Config: " + str(config)) 3950 3951 # Config - Folders - Databases 3952 databases_folders = ( 3953 config.get("folders", {}) 3954 .get("databases", {}) 3955 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 3956 ) 3957 databases_folders = full_path(databases_folders) 3958 if not os.path.exists(databases_folders): 3959 log.error(f"Databases annotations: {databases_folders} NOT found") 3960 log.debug("Databases annotations: " + str(databases_folders)) 3961 3962 # Config - Exomiser 3963 exomiser_bin_command = get_bin_command( 3964 bin="exomiser-cli*.jar", 3965 tool="exomiser", 3966 bin_type="jar", 3967 config=config, 3968 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 3969 ) 3970 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 3971 if not exomiser_bin_command: 3972 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 3973 log.error(msg_err) 3974 raise ValueError(msg_err) 3975 3976 # Param 3977 param = self.get_param() 3978 log.debug("Param: " + str(param)) 3979 3980 # Param - Exomiser 3981 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 3982 log.debug(f"Param Exomiser: {param_exomiser}") 3983 3984 # Param - Assembly 3985 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 3986 log.debug("Assembly: " + str(assembly)) 3987 3988 # Data 3989 table_variants = self.get_table_variants() 3990 3991 # Check if not empty 3992 log.debug("Check if not empty") 3993 sql_query_chromosomes = 
( 3994 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3995 ) 3996 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 3997 log.info(f"VCF empty") 3998 return False 3999 4000 # VCF header 4001 vcf_reader = self.get_header() 4002 log.debug("Initial header: " + str(vcf_reader.infos)) 4003 4004 # Samples 4005 samples = self.get_header_sample_list() 4006 if not samples: 4007 log.error("No Samples in VCF") 4008 return False 4009 log.debug(f"Samples: {samples}") 4010 4011 # Memory limit 4012 memory_limit = self.get_memory("8G") 4013 log.debug(f"memory_limit: {memory_limit}") 4014 4015 # Exomiser java options 4016 exomiser_java_options = ( 4017 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4018 ) 4019 log.debug(f"Exomiser java options: {exomiser_java_options}") 4020 4021 # Download Exomiser (if not exists) 4022 exomiser_release = param_exomiser.get("release", None) 4023 exomiser_application_properties = param_exomiser.get( 4024 "exomiser_application_properties", None 4025 ) 4026 databases_download_exomiser( 4027 assemblies=[assembly], 4028 exomiser_folder=databases_folders, 4029 exomiser_release=exomiser_release, 4030 exomiser_phenotype_release=exomiser_release, 4031 exomiser_application_properties=exomiser_application_properties, 4032 ) 4033 4034 # Force annotation 4035 force_update_annotation = True 4036 4037 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4038 log.debug("Start annotation Exomiser") 4039 4040 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4041 4042 # tmp_dir = "/tmp/exomiser" 4043 4044 ### ANALYSIS ### 4045 ################ 4046 4047 # Create analysis.json through analysis dict 4048 # either analysis in param or by default 4049 # depending on preset exome/genome) 4050 4051 # Init analysis dict 4052 param_exomiser_analysis_dict = {} 4053 4054 # analysis from param 4055 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4056 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4057 4058 # If analysis in param -> load anlaysis json 4059 if param_exomiser_analysis: 4060 4061 # If param analysis is a file and exists 4062 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4063 param_exomiser_analysis 4064 ): 4065 # Load analysis file into analysis dict (either yaml or json) 4066 with open(param_exomiser_analysis) as json_file: 4067 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4068 4069 # If param analysis is a dict 4070 elif isinstance(param_exomiser_analysis, dict): 4071 # Load analysis dict into analysis dict (either yaml or json) 4072 param_exomiser_analysis_dict = param_exomiser_analysis 4073 4074 # Error analysis type 4075 else: 4076 log.error(f"Analysis type unknown. Check param file.") 4077 raise ValueError(f"Analysis type unknown. Check param file.") 4078 4079 # Case no input analysis config file/dict 4080 # Use preset (exome/genome) to open default config file 4081 if not param_exomiser_analysis_dict: 4082 4083 # default preset 4084 default_preset = "exome" 4085 4086 # Get param preset or default preset 4087 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4088 4089 # Try to find if preset is a file 4090 if os.path.exists(param_exomiser_preset): 4091 # Preset file is provided in full path 4092 param_exomiser_analysis_default_config_file = ( 4093 param_exomiser_preset 4094 ) 4095 # elif os.path.exists(full_path(param_exomiser_preset)): 4096 # # Preset file is provided in full path 4097 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4098 elif os.path.exists( 4099 os.path.join(folder_config, param_exomiser_preset) 4100 ): 4101 # Preset file is provided a basename in config folder (can be a path with subfolders) 4102 param_exomiser_analysis_default_config_file = os.path.join( 4103 folder_config, param_exomiser_preset 4104 ) 4105 else: 4106 # Construct preset file 4107 
param_exomiser_analysis_default_config_file = os.path.join( 4108 folder_config, 4109 f"preset-{param_exomiser_preset}-analysis.json", 4110 ) 4111 4112 # If preset file exists 4113 param_exomiser_analysis_default_config_file = full_path( 4114 param_exomiser_analysis_default_config_file 4115 ) 4116 if os.path.exists(param_exomiser_analysis_default_config_file): 4117 # Load prest file into analysis dict (either yaml or json) 4118 with open( 4119 param_exomiser_analysis_default_config_file 4120 ) as json_file: 4121 # param_exomiser_analysis_dict[""] = json.load(json_file) 4122 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4123 json_file 4124 ) 4125 4126 # Error preset file 4127 else: 4128 log.error( 4129 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4130 ) 4131 raise ValueError( 4132 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4133 ) 4134 4135 # If no analysis dict created 4136 if not param_exomiser_analysis_dict: 4137 log.error(f"No analysis config") 4138 raise ValueError(f"No analysis config") 4139 4140 # Log 4141 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4142 4143 ### PHENOPACKET ### 4144 ################### 4145 4146 # If no PhenoPacket in analysis dict -> check in param 4147 if "phenopacket" not in param_exomiser_analysis_dict: 4148 4149 # If PhenoPacket in param -> load anlaysis json 4150 if param_exomiser.get("phenopacket", None): 4151 4152 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4153 param_exomiser_phenopacket = full_path( 4154 param_exomiser_phenopacket 4155 ) 4156 4157 # If param phenopacket is a file and exists 4158 if isinstance( 4159 param_exomiser_phenopacket, str 4160 ) and os.path.exists(param_exomiser_phenopacket): 4161 # Load phenopacket file into analysis dict (either yaml or json) 4162 with open(param_exomiser_phenopacket) as json_file: 4163 param_exomiser_analysis_dict["phenopacket"] = ( 4164 yaml.safe_load(json_file) 
4165 ) 4166 4167 # If param phenopacket is a dict 4168 elif isinstance(param_exomiser_phenopacket, dict): 4169 # Load phenopacket dict into analysis dict (either yaml or json) 4170 param_exomiser_analysis_dict["phenopacket"] = ( 4171 param_exomiser_phenopacket 4172 ) 4173 4174 # Error phenopacket type 4175 else: 4176 log.error(f"Phenopacket type unknown. Check param file.") 4177 raise ValueError( 4178 f"Phenopacket type unknown. Check param file." 4179 ) 4180 4181 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4182 if "phenopacket" not in param_exomiser_analysis_dict: 4183 4184 # Init PhenoPacket 4185 param_exomiser_analysis_dict["phenopacket"] = { 4186 "id": "analysis", 4187 "proband": {}, 4188 } 4189 4190 ### Add subject ### 4191 4192 # If subject exists 4193 param_exomiser_subject = param_exomiser.get("subject", {}) 4194 4195 # If subject not exists -> found sample ID 4196 if not param_exomiser_subject: 4197 4198 # Found sample ID in param 4199 sample = param_exomiser.get("sample", None) 4200 4201 # Find sample ID (first sample) 4202 if not sample: 4203 sample_list = self.get_header_sample_list() 4204 if len(sample_list) > 0: 4205 sample = sample_list[0] 4206 else: 4207 log.error(f"No sample found") 4208 raise ValueError(f"No sample found") 4209 4210 # Create subject 4211 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4212 4213 # Add to dict 4214 param_exomiser_analysis_dict["phenopacket"][ 4215 "subject" 4216 ] = param_exomiser_subject 4217 4218 ### Add "phenotypicFeatures" ### 4219 4220 # If phenotypicFeatures exists 4221 param_exomiser_phenotypicfeatures = param_exomiser.get( 4222 "phenotypicFeatures", [] 4223 ) 4224 4225 # If phenotypicFeatures not exists -> Try to infer from hpo list 4226 if not param_exomiser_phenotypicfeatures: 4227 4228 # Found HPO in param 4229 param_exomiser_hpo = param_exomiser.get("hpo", []) 4230 4231 # Split HPO if list in string format separated by comma 4232 if 
isinstance(param_exomiser_hpo, str): 4233 param_exomiser_hpo = param_exomiser_hpo.split(",") 4234 4235 # Create HPO list 4236 for hpo in param_exomiser_hpo: 4237 hpo_clean = re.sub("[^0-9]", "", hpo) 4238 param_exomiser_phenotypicfeatures.append( 4239 { 4240 "type": { 4241 "id": f"HP:{hpo_clean}", 4242 "label": f"HP:{hpo_clean}", 4243 } 4244 } 4245 ) 4246 4247 # Add to dict 4248 param_exomiser_analysis_dict["phenopacket"][ 4249 "phenotypicFeatures" 4250 ] = param_exomiser_phenotypicfeatures 4251 4252 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4253 if not param_exomiser_phenotypicfeatures: 4254 for step in param_exomiser_analysis_dict.get( 4255 "analysis", {} 4256 ).get("steps", []): 4257 if "hiPhivePrioritiser" in step: 4258 param_exomiser_analysis_dict.get("analysis", {}).get( 4259 "steps", [] 4260 ).remove(step) 4261 4262 ### Add Input File ### 4263 4264 # Initial file name and htsFiles 4265 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4266 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4267 { 4268 "uri": tmp_vcf_name, 4269 "htsFormat": "VCF", 4270 "genomeAssembly": assembly, 4271 } 4272 ] 4273 4274 ### Add metaData ### 4275 4276 # If metaData not in analysis dict 4277 if "metaData" not in param_exomiser_analysis_dict: 4278 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4279 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4280 "createdBy": "howard", 4281 "phenopacketSchemaVersion": 1, 4282 } 4283 4284 ### OutputOptions ### 4285 4286 # Init output result folder 4287 output_results = os.path.join(tmp_dir, "results") 4288 4289 # If no outputOptions in analysis dict 4290 if "outputOptions" not in param_exomiser_analysis_dict: 4291 4292 # default output formats 4293 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4294 4295 # Get outputOptions in param 4296 output_options = param_exomiser.get("outputOptions", None) 4297 4298 # If no output_options in param -> check 4299 if not output_options: 
4300 output_options = { 4301 "outputContributingVariantsOnly": False, 4302 "numGenes": 0, 4303 "outputFormats": defaut_output_formats, 4304 } 4305 4306 # Replace outputDirectory in output options 4307 output_options["outputDirectory"] = output_results 4308 output_options["outputFileName"] = "howard" 4309 4310 # Add outputOptions in analysis dict 4311 param_exomiser_analysis_dict["outputOptions"] = output_options 4312 4313 else: 4314 4315 # Replace output_results and output format (if exists in param) 4316 param_exomiser_analysis_dict["outputOptions"][ 4317 "outputDirectory" 4318 ] = output_results 4319 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4320 list( 4321 set( 4322 param_exomiser_analysis_dict.get( 4323 "outputOptions", {} 4324 ).get("outputFormats", []) 4325 + ["TSV_VARIANT", "VCF"] 4326 ) 4327 ) 4328 ) 4329 4330 # log 4331 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4332 4333 ### ANALYSIS FILE ### 4334 ##################### 4335 4336 ### Full JSON analysis config file ### 4337 4338 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4339 with open(exomiser_analysis, "w") as fp: 4340 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4341 4342 ### SPLIT analysis and sample config files 4343 4344 # Splitted analysis dict 4345 param_exomiser_analysis_dict_for_split = ( 4346 param_exomiser_analysis_dict.copy() 4347 ) 4348 4349 # Phenopacket JSON file 4350 exomiser_analysis_phenopacket = os.path.join( 4351 tmp_dir, "analysis_phenopacket.json" 4352 ) 4353 with open(exomiser_analysis_phenopacket, "w") as fp: 4354 json.dump( 4355 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4356 fp, 4357 indent=4, 4358 ) 4359 4360 # Analysis JSON file without Phenopacket parameters 4361 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4362 exomiser_analysis_analysis = os.path.join( 4363 tmp_dir, "analysis_analysis.json" 4364 ) 4365 with open(exomiser_analysis_analysis, "w") as fp: 4366 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4367 4368 ### INITAL VCF file ### 4369 ####################### 4370 4371 ### Create list of samples to use and include inti initial VCF file #### 4372 4373 # Subject (main sample) 4374 # Get sample ID in analysis dict 4375 sample_subject = ( 4376 param_exomiser_analysis_dict.get("phenopacket", {}) 4377 .get("subject", {}) 4378 .get("id", None) 4379 ) 4380 sample_proband = ( 4381 param_exomiser_analysis_dict.get("phenopacket", {}) 4382 .get("proband", {}) 4383 .get("subject", {}) 4384 .get("id", None) 4385 ) 4386 sample = [] 4387 if sample_subject: 4388 sample.append(sample_subject) 4389 if sample_proband: 4390 sample.append(sample_proband) 4391 4392 # Get sample ID within Pedigree 4393 pedigree_persons_list = ( 4394 param_exomiser_analysis_dict.get("phenopacket", {}) 4395 .get("pedigree", {}) 4396 .get("persons", {}) 4397 ) 4398 4399 # Create list with all sample ID in pedigree (if exists) 4400 pedigree_persons = [] 4401 for person in pedigree_persons_list: 4402 pedigree_persons.append(person.get("individualId")) 4403 4404 # Concat subject sample ID and samples ID in pedigreesamples 4405 samples = list(set(sample + pedigree_persons)) 4406 4407 # Check if sample list is not empty 4408 if not samples: 4409 log.error(f"No samples found") 4410 raise ValueError(f"No samples found") 4411 4412 # Create VCF with sample (either sample in param or first one by default) 4413 # Export VCF file 4414 self.export_variant_vcf( 4415 vcf_file=tmp_vcf_name, 4416 remove_info=True, 4417 add_samples=True, 4418 list_samples=samples, 4419 index=False, 4420 ) 4421 4422 ### Execute Exomiser ### 4423 ######################## 4424 4425 # Init command 4426 exomiser_command = "" 4427 4428 # Command exomiser options 4429 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4430 4431 # Release 4432 exomiser_release = 
param_exomiser.get("release", None) 4433 if exomiser_release: 4434 # phenotype data version 4435 exomiser_options += ( 4436 f" --exomiser.phenotype.data-version={exomiser_release} " 4437 ) 4438 # data version 4439 exomiser_options += ( 4440 f" --exomiser.{assembly}.data-version={exomiser_release} " 4441 ) 4442 # variant white list 4443 variant_white_list_file = ( 4444 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4445 ) 4446 if os.path.exists( 4447 os.path.join( 4448 databases_folders, assembly, variant_white_list_file 4449 ) 4450 ): 4451 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4452 4453 # transcript_source 4454 transcript_source = param_exomiser.get( 4455 "transcript_source", None 4456 ) # ucsc, refseq, ensembl 4457 if transcript_source: 4458 exomiser_options += ( 4459 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4460 ) 4461 4462 # If analysis contain proband param 4463 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4464 "proband", {} 4465 ): 4466 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4467 4468 # If no proband (usually uniq sample) 4469 else: 4470 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4471 4472 # Log 4473 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4474 4475 # Run command 4476 result = subprocess.call( 4477 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4478 ) 4479 if result: 4480 log.error("Exomiser command failed") 4481 raise ValueError("Exomiser command failed") 4482 4483 ### RESULTS ### 4484 ############### 4485 4486 ### Annotate with TSV fields ### 4487 4488 # Init result tsv file 4489 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4490 4491 # Init result tsv file 4492 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4493 4494 # Parse TSV file and explode columns in INFO field 4495 if exomiser_to_info and os.path.exists(output_results_tsv): 4496 4497 # Log 4498 log.debug("Exomiser columns to VCF INFO field") 4499 4500 # Retrieve columns and types 4501 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4502 output_results_tsv_df = self.get_query_to_df(query) 4503 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4504 4505 # Init concat fields for update 4506 sql_query_update_concat_fields = [] 4507 4508 # Fields to avoid 4509 fields_to_avoid = [ 4510 "CONTIG", 4511 "START", 4512 "END", 4513 "REF", 4514 "ALT", 4515 "QUAL", 4516 "FILTER", 4517 "GENOTYPE", 4518 ] 4519 4520 # List all columns to add into header 4521 for header_column in output_results_tsv_columns: 4522 4523 # If header column is enable 4524 if header_column not in fields_to_avoid: 4525 4526 # Header info type 4527 header_info_type = "String" 4528 header_column_df = output_results_tsv_df[header_column] 4529 header_column_df_dtype = header_column_df.dtype 4530 if header_column_df_dtype == object: 4531 if ( 4532 pd.to_numeric(header_column_df, errors="coerce") 4533 .notnull() 4534 .all() 4535 ): 4536 header_info_type = "Float" 4537 else: 4538 header_info_type = "Integer" 4539 4540 # Header info 4541 characters_to_validate = ["-"] 4542 pattern = "[" + "".join(characters_to_validate) + "]" 4543 header_info_name = re.sub( 4544 pattern, 4545 "_", 4546 f"Exomiser_{header_column}".replace("#", ""), 4547 ) 4548 header_info_number = "." 
4549 header_info_description = ( 4550 f"Exomiser {header_column} annotation" 4551 ) 4552 header_info_source = "Exomiser" 4553 header_info_version = "unknown" 4554 header_info_code = CODE_TYPE_MAP[header_info_type] 4555 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4556 header_info_name, 4557 header_info_number, 4558 header_info_type, 4559 header_info_description, 4560 header_info_source, 4561 header_info_version, 4562 header_info_code, 4563 ) 4564 4565 # Add field to add for update to concat fields 4566 sql_query_update_concat_fields.append( 4567 f""" 4568 CASE 4569 WHEN table_parquet."{header_column}" NOT IN ('','.') 4570 THEN concat( 4571 '{header_info_name}=', 4572 table_parquet."{header_column}", 4573 ';' 4574 ) 4575 4576 ELSE '' 4577 END 4578 """ 4579 ) 4580 4581 # Update query 4582 sql_query_update = f""" 4583 UPDATE {table_variants} as table_variants 4584 SET INFO = concat( 4585 CASE 4586 WHEN INFO NOT IN ('', '.') 4587 THEN INFO 4588 ELSE '' 4589 END, 4590 CASE 4591 WHEN table_variants.INFO NOT IN ('','.') 4592 THEN ';' 4593 ELSE '' 4594 END, 4595 ( 4596 SELECT 4597 concat( 4598 {",".join(sql_query_update_concat_fields)} 4599 ) 4600 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4601 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4602 AND table_parquet.\"START\" = table_variants.\"POS\" 4603 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4604 AND table_parquet.\"REF\" = table_variants.\"REF\" 4605 ) 4606 ) 4607 ; 4608 """ 4609 4610 # Update 4611 self.conn.execute(sql_query_update) 4612 4613 ### Annotate with VCF INFO field ### 4614 4615 # Init result VCF file 4616 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4617 4618 # If VCF exists 4619 if os.path.exists(output_results_vcf): 4620 4621 # Log 4622 log.debug("Exomiser result VCF update variants") 4623 4624 # Find Exomiser INFO field annotation in header 4625 with 
gzip.open(output_results_vcf, "rt") as f: 4626 header_list = self.read_vcf_header(f) 4627 exomiser_vcf_header = vcf.Reader( 4628 io.StringIO("\n".join(header_list)) 4629 ) 4630 4631 # Add annotation INFO field to header 4632 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4633 4634 # Update variants with VCF 4635 self.update_from_vcf(output_results_vcf) 4636 4637 return True 4638 4639 def annotation_snpeff(self, threads: int = None) -> None: 4640 """ 4641 This function annotate with snpEff 4642 4643 :param threads: The number of threads to use 4644 :return: the value of the variable "return_value". 4645 """ 4646 4647 # DEBUG 4648 log.debug("Start annotation with snpeff databases") 4649 4650 # Threads 4651 if not threads: 4652 threads = self.get_threads() 4653 log.debug("Threads: " + str(threads)) 4654 4655 # DEBUG 4656 delete_tmp = True 4657 if self.get_config().get("verbosity", "warning") in ["debug"]: 4658 delete_tmp = False 4659 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4660 4661 # Config 4662 config = self.get_config() 4663 log.debug("Config: " + str(config)) 4664 4665 # Config - Folders - Databases 4666 databases_folders = ( 4667 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4668 ) 4669 log.debug("Databases annotations: " + str(databases_folders)) 4670 4671 # # Config - Java 4672 # java_bin = get_bin( 4673 # tool="java", 4674 # bin="java", 4675 # bin_type="bin", 4676 # config=config, 4677 # default_folder="/usr/bin", 4678 # ) 4679 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4680 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4681 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4682 4683 # # Config - snpEff bin 4684 # snpeff_jar = get_bin( 4685 # tool="snpeff", 4686 # bin="snpEff.jar", 4687 # bin_type="jar", 4688 # config=config, 4689 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4690 # ) 4691 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4692 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4693 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4694 4695 # Config - snpEff bin command 4696 snpeff_bin_command = get_bin_command( 4697 bin="snpEff.jar", 4698 tool="snpeff", 4699 bin_type="jar", 4700 config=config, 4701 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4702 ) 4703 if not snpeff_bin_command: 4704 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4705 log.error(msg_err) 4706 raise ValueError(msg_err) 4707 4708 # Config - snpEff databases 4709 snpeff_databases = ( 4710 config.get("folders", {}) 4711 .get("databases", {}) 4712 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4713 ) 4714 snpeff_databases = full_path(snpeff_databases) 4715 if snpeff_databases is not None and snpeff_databases != "": 4716 log.debug(f"Create snpEff databases folder") 4717 if not os.path.exists(snpeff_databases): 4718 os.makedirs(snpeff_databases) 4719 4720 # Param 4721 param = self.get_param() 4722 log.debug("Param: " + str(param)) 4723 4724 # Param 4725 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4726 log.debug("Options: " + str(options)) 4727 4728 # Param - Assembly 4729 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4730 4731 # Param - Options 4732 snpeff_options = ( 4733 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4734 ) 4735 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4736 snpeff_csvstats = ( 4737 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4738 ) 4739 if snpeff_stats: 4740 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4741 snpeff_stats = full_path(snpeff_stats) 4742 snpeff_options += f" -stats {snpeff_stats}" 4743 if snpeff_csvstats: 4744 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4745 snpeff_csvstats = full_path(snpeff_csvstats) 4746 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4747 4748 # Data 4749 table_variants = self.get_table_variants() 4750 4751 # Check if not empty 4752 log.debug("Check if not empty") 4753 sql_query_chromosomes = ( 4754 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4755 ) 4756 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4757 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4758 log.info(f"VCF empty") 4759 return 4760 4761 # Export in VCF 4762 log.debug("Create initial file to annotate") 4763 tmp_vcf = NamedTemporaryFile( 4764 prefix=self.get_prefix(), 4765 dir=self.get_tmp_dir(), 4766 suffix=".vcf.gz", 4767 delete=True, 4768 ) 4769 tmp_vcf_name = tmp_vcf.name 4770 4771 # VCF header 4772 vcf_reader = self.get_header() 4773 log.debug("Initial header: " + str(vcf_reader.infos)) 4774 4775 # Existing annotations 4776 for vcf_annotation in self.get_header().infos: 4777 4778 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4779 log.debug( 4780 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4781 ) 4782 4783 # Memory limit 4784 # if config.get("memory", None): 4785 # memory_limit = config.get("memory", "8G") 4786 # else: 4787 # memory_limit = "8G" 4788 memory_limit = self.get_memory("8G") 4789 log.debug(f"memory_limit: {memory_limit}") 4790 4791 # snpEff java options 4792 snpeff_java_options = ( 4793 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4794 ) 4795 log.debug(f"Exomiser java options: {snpeff_java_options}") 4796 4797 force_update_annotation = True 4798 4799 if "ANN" not in self.get_header().infos or force_update_annotation: 4800 4801 # Check snpEff database 4802 log.debug(f"Check snpEff databases {[assembly]}") 4803 databases_download_snpeff( 4804 folder=snpeff_databases, assemblies=[assembly], config=config 4805 ) 4806 4807 # Export VCF file 4808 self.export_variant_vcf( 4809 vcf_file=tmp_vcf_name, 4810 remove_info=True, 
4811 add_samples=False, 4812 index=True, 4813 ) 4814 4815 # Tmp file 4816 err_files = [] 4817 tmp_annotate_vcf = NamedTemporaryFile( 4818 prefix=self.get_prefix(), 4819 dir=self.get_tmp_dir(), 4820 suffix=".vcf", 4821 delete=False, 4822 ) 4823 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4824 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4825 err_files.append(tmp_annotate_vcf_name_err) 4826 4827 # Command 4828 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4829 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4830 run_parallel_commands([snpeff_command], 1) 4831 4832 # Error messages 4833 log.info(f"Error/Warning messages:") 4834 error_message_command_all = [] 4835 error_message_command_warning = [] 4836 error_message_command_err = [] 4837 for err_file in err_files: 4838 with open(err_file, "r") as f: 4839 for line in f: 4840 message = line.strip() 4841 error_message_command_all.append(message) 4842 if line.startswith("[W::"): 4843 error_message_command_warning.append(message) 4844 if line.startswith("[E::"): 4845 error_message_command_err.append(f"{err_file}: " + message) 4846 # log info 4847 for message in list( 4848 set(error_message_command_err + error_message_command_warning) 4849 ): 4850 log.info(f" {message}") 4851 # debug info 4852 for message in list(set(error_message_command_all)): 4853 log.debug(f" {message}") 4854 # failed 4855 if len(error_message_command_err): 4856 log.error("Annotation failed: Error in commands") 4857 raise ValueError("Annotation failed: Error in commands") 4858 4859 # Find annotation in header 4860 with open(tmp_annotate_vcf_name, "rt") as f: 4861 header_list = self.read_vcf_header(f) 4862 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4863 4864 for ann in annovar_vcf_header.infos: 4865 if ann not in self.get_header().infos: 4866 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann) 4867 4868 # Update variants 4869 log.info(f"Annotation - Updating...") 4870 self.update_from_vcf(tmp_annotate_vcf_name) 4871 4872 else: 4873 if "ANN" in self.get_header().infos: 4874 log.debug(f"Existing snpEff annotations in VCF") 4875 if force_update_annotation: 4876 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 4877 4878 def annotation_annovar(self, threads: int = None) -> None: 4879 """ 4880 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4881 annotations 4882 4883 :param threads: number of threads to use 4884 :return: the value of the variable "return_value". 4885 """ 4886 4887 # DEBUG 4888 log.debug("Start annotation with Annovar databases") 4889 4890 # Threads 4891 if not threads: 4892 threads = self.get_threads() 4893 log.debug("Threads: " + str(threads)) 4894 4895 # Tmp en Err files 4896 tmp_files = [] 4897 err_files = [] 4898 4899 # DEBUG 4900 delete_tmp = True 4901 if self.get_config().get("verbosity", "warning") in ["debug"]: 4902 delete_tmp = False 4903 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4904 4905 # Config 4906 config = self.get_config() 4907 log.debug("Config: " + str(config)) 4908 4909 # Config - Folders - Databases 4910 databases_folders = ( 4911 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4912 ) 4913 log.debug("Databases annotations: " + str(databases_folders)) 4914 4915 # Config - annovar bin command 4916 annovar_bin_command = get_bin_command( 4917 bin="table_annovar.pl", 4918 tool="annovar", 4919 bin_type="perl", 4920 config=config, 4921 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 4922 ) 4923 if not annovar_bin_command: 4924 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 4925 log.error(msg_err) 4926 raise ValueError(msg_err) 4927 4928 # Config - BCFTools bin command 4929 bcftools_bin_command = get_bin_command( 4930 bin="bcftools", 4931 tool="bcftools", 4932 
bin_type="bin", 4933 config=config, 4934 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4935 ) 4936 if not bcftools_bin_command: 4937 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4938 log.error(msg_err) 4939 raise ValueError(msg_err) 4940 4941 # Config - annovar databases 4942 annovar_databases = ( 4943 config.get("folders", {}) 4944 .get("databases", {}) 4945 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 4946 ) 4947 annovar_databases = full_path(annovar_databases) 4948 if annovar_databases != "" and not os.path.exists(annovar_databases): 4949 os.makedirs(annovar_databases) 4950 4951 # Param 4952 param = self.get_param() 4953 log.debug("Param: " + str(param)) 4954 4955 # Param - options 4956 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 4957 log.debug("Options: " + str(options)) 4958 4959 # Param - annotations 4960 annotations = ( 4961 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 4962 ) 4963 log.debug("Annotations: " + str(annotations)) 4964 4965 # Param - Assembly 4966 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4967 4968 # Annovar database assembly 4969 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 4970 if annovar_databases_assembly != "" and not os.path.exists( 4971 annovar_databases_assembly 4972 ): 4973 os.makedirs(annovar_databases_assembly) 4974 4975 # Data 4976 table_variants = self.get_table_variants() 4977 4978 # Check if not empty 4979 log.debug("Check if not empty") 4980 sql_query_chromosomes = ( 4981 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4982 ) 4983 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4984 if not sql_query_chromosomes_df["count"][0]: 4985 log.info(f"VCF empty") 4986 return 4987 4988 # VCF header 4989 vcf_reader = self.get_header() 4990 log.debug("Initial header: " + str(vcf_reader.infos)) 4991 4992 # Existing annotations 4993 for vcf_annotation in 
self.get_header().infos: 4994 4995 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4996 log.debug( 4997 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4998 ) 4999 5000 force_update_annotation = True 5001 5002 if annotations: 5003 5004 commands = [] 5005 tmp_annotates_vcf_name_list = [] 5006 5007 # Export in VCF 5008 log.debug("Create initial file to annotate") 5009 tmp_vcf = NamedTemporaryFile( 5010 prefix=self.get_prefix(), 5011 dir=self.get_tmp_dir(), 5012 suffix=".vcf.gz", 5013 delete=False, 5014 ) 5015 tmp_vcf_name = tmp_vcf.name 5016 tmp_files.append(tmp_vcf_name) 5017 tmp_files.append(tmp_vcf_name + ".tbi") 5018 5019 # Export VCF file 5020 self.export_variant_vcf( 5021 vcf_file=tmp_vcf_name, 5022 remove_info=".", 5023 add_samples=False, 5024 index=True, 5025 ) 5026 5027 # Create file for field rename 5028 log.debug("Create file for field rename") 5029 tmp_rename = NamedTemporaryFile( 5030 prefix=self.get_prefix(), 5031 dir=self.get_tmp_dir(), 5032 suffix=".rename", 5033 delete=False, 5034 ) 5035 tmp_rename_name = tmp_rename.name 5036 tmp_files.append(tmp_rename_name) 5037 5038 # Check Annovar database 5039 log.debug( 5040 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5041 ) 5042 databases_download_annovar( 5043 folder=annovar_databases, 5044 files=list(annotations.keys()), 5045 assemblies=[assembly], 5046 ) 5047 5048 for annotation in annotations: 5049 annotation_fields = annotations[annotation] 5050 5051 if not annotation_fields: 5052 annotation_fields = {"INFO": None} 5053 5054 log.info(f"Annotations Annovar - database '{annotation}'") 5055 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5056 5057 # Tmp file for annovar 5058 err_files = [] 5059 tmp_annotate_vcf_directory = TemporaryDirectory( 5060 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5061 ) 5062 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5063 
tmp_annotate_vcf_name_annovar = ( 5064 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5065 ) 5066 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5067 err_files.append(tmp_annotate_vcf_name_err) 5068 tmp_files.append(tmp_annotate_vcf_name_err) 5069 5070 # Tmp file final vcf annotated by annovar 5071 tmp_annotate_vcf = NamedTemporaryFile( 5072 prefix=self.get_prefix(), 5073 dir=self.get_tmp_dir(), 5074 suffix=".vcf.gz", 5075 delete=False, 5076 ) 5077 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5078 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5079 tmp_files.append(tmp_annotate_vcf_name) 5080 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5081 5082 # Number of fields 5083 annotation_list = [] 5084 annotation_renamed_list = [] 5085 5086 for annotation_field in annotation_fields: 5087 5088 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5089 annotation_fields_new_name = annotation_fields.get( 5090 annotation_field, annotation_field 5091 ) 5092 if not annotation_fields_new_name: 5093 annotation_fields_new_name = annotation_field 5094 5095 if ( 5096 force_update_annotation 5097 or annotation_fields_new_name not in self.get_header().infos 5098 ): 5099 annotation_list.append(annotation_field) 5100 annotation_renamed_list.append(annotation_fields_new_name) 5101 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5102 log.warning( 5103 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5104 ) 5105 5106 # Add rename info 5107 run_parallel_commands( 5108 [ 5109 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5110 ], 5111 1, 5112 ) 5113 5114 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5115 log.debug("annotation_list: " + str(annotation_list)) 5116 5117 # protocol 5118 protocol = annotation 5119 5120 # argument 5121 argument = "" 5122 5123 # operation 5124 operation = "f" 
5125 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5126 "ensGene" 5127 ): 5128 operation = "g" 5129 if options.get("genebase", None): 5130 argument = f"""'{options.get("genebase","")}'""" 5131 elif annotation in ["cytoBand"]: 5132 operation = "r" 5133 5134 # argument option 5135 argument_option = "" 5136 if argument != "": 5137 argument_option = " --argument " + argument 5138 5139 # command options 5140 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5141 for option in options: 5142 if option not in ["genebase"]: 5143 command_options += f""" --{option}={options[option]}""" 5144 5145 # Command 5146 5147 # Command - Annovar 5148 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5149 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5150 5151 # Command - start pipe 5152 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5153 5154 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5155 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5156 5157 # Command - Special characters (refGene annotation) 5158 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5159 5160 # Command - Clean empty fields (with value ".") 5161 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5162 5163 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5164 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5165 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5166 # for ann in annotation_renamed_list: 5167 for ann in annotation_list: 5168 annovar_fields_to_keep.append(f"^INFO/{ann}") 5169 5170 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5171 5172 # Command - indexing 5173 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5174 5175 log.debug(f"Annotation - Annovar command: {command_annovar}") 5176 run_parallel_commands([command_annovar], 1) 5177 5178 # Error messages 5179 log.info(f"Error/Warning messages:") 5180 error_message_command_all = [] 5181 error_message_command_warning = [] 5182 error_message_command_err = [] 5183 for err_file in err_files: 5184 with open(err_file, "r") as f: 5185 for line in f: 5186 message = line.strip() 5187 error_message_command_all.append(message) 5188 if line.startswith("[W::") or line.startswith("WARNING"): 5189 error_message_command_warning.append(message) 5190 if line.startswith("[E::") or line.startswith("ERROR"): 5191 
error_message_command_err.append( 5192 f"{err_file}: " + message 5193 ) 5194 # log info 5195 for message in list( 5196 set(error_message_command_err + error_message_command_warning) 5197 ): 5198 log.info(f" {message}") 5199 # debug info 5200 for message in list(set(error_message_command_all)): 5201 log.debug(f" {message}") 5202 # failed 5203 if len(error_message_command_err): 5204 log.error("Annotation failed: Error in commands") 5205 raise ValueError("Annotation failed: Error in commands") 5206 5207 if tmp_annotates_vcf_name_list: 5208 5209 # List of annotated files 5210 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5211 5212 # Tmp file 5213 tmp_annotate_vcf = NamedTemporaryFile( 5214 prefix=self.get_prefix(), 5215 dir=self.get_tmp_dir(), 5216 suffix=".vcf.gz", 5217 delete=False, 5218 ) 5219 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5220 tmp_files.append(tmp_annotate_vcf_name) 5221 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5222 err_files.append(tmp_annotate_vcf_name_err) 5223 tmp_files.append(tmp_annotate_vcf_name_err) 5224 5225 # Command merge 5226 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5227 log.info( 5228 f"Annotation Annovar - Annotation merging " 5229 + str(len(tmp_annotates_vcf_name_list)) 5230 + " annotated files" 5231 ) 5232 log.debug(f"Annotation - merge command: {merge_command}") 5233 run_parallel_commands([merge_command], 1) 5234 5235 # Find annotation in header 5236 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5237 header_list = self.read_vcf_header(f) 5238 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5239 5240 for ann in annovar_vcf_header.infos: 5241 if ann not in self.get_header().infos: 5242 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5243 5244 # Update variants 5245 log.info(f"Annotation Annovar - 
Updating...") 5246 self.update_from_vcf(tmp_annotate_vcf_name) 5247 5248 # Clean files 5249 # Tmp file remove command 5250 if True: 5251 tmp_files_remove_command = "" 5252 if tmp_files: 5253 tmp_files_remove_command = " ".join(tmp_files) 5254 clean_command = f" rm -f {tmp_files_remove_command} " 5255 log.debug(f"Annotation Annovar - Annotation cleaning ") 5256 log.debug(f"Annotation - cleaning command: {clean_command}") 5257 run_parallel_commands([clean_command], 1) 5258 5259 # Parquet 5260 def annotation_parquet(self, threads: int = None) -> None: 5261 """ 5262 It takes a VCF file, and annotates it with a parquet file 5263 5264 :param threads: number of threads to use for the annotation 5265 :return: the value of the variable "result". 5266 """ 5267 5268 # DEBUG 5269 log.debug("Start annotation with parquet databases") 5270 5271 # Threads 5272 if not threads: 5273 threads = self.get_threads() 5274 log.debug("Threads: " + str(threads)) 5275 5276 # DEBUG 5277 delete_tmp = True 5278 if self.get_config().get("verbosity", "warning") in ["debug"]: 5279 delete_tmp = False 5280 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5281 5282 # Config 5283 databases_folders = set( 5284 self.get_config() 5285 .get("folders", {}) 5286 .get("databases", {}) 5287 .get("annotations", ["."]) 5288 + self.get_config() 5289 .get("folders", {}) 5290 .get("databases", {}) 5291 .get("parquet", ["."]) 5292 ) 5293 log.debug("Databases annotations: " + str(databases_folders)) 5294 5295 # Param 5296 annotations = ( 5297 self.get_param() 5298 .get("annotation", {}) 5299 .get("parquet", {}) 5300 .get("annotations", None) 5301 ) 5302 log.debug("Annotations: " + str(annotations)) 5303 5304 # Assembly 5305 assembly = self.get_param().get( 5306 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5307 ) 5308 5309 # Force Update Annotation 5310 force_update_annotation = ( 5311 self.get_param() 5312 .get("annotation", {}) 5313 .get("options", {}) 5314 .get("annotations_update", 
False) 5315 ) 5316 log.debug(f"force_update_annotation={force_update_annotation}") 5317 force_append_annotation = ( 5318 self.get_param() 5319 .get("annotation", {}) 5320 .get("options", {}) 5321 .get("annotations_append", False) 5322 ) 5323 log.debug(f"force_append_annotation={force_append_annotation}") 5324 5325 # Data 5326 table_variants = self.get_table_variants() 5327 5328 # Check if not empty 5329 log.debug("Check if not empty") 5330 sql_query_chromosomes_df = self.get_query_to_df( 5331 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5332 ) 5333 if not sql_query_chromosomes_df["count"][0]: 5334 log.info(f"VCF empty") 5335 return 5336 5337 # VCF header 5338 vcf_reader = self.get_header() 5339 log.debug("Initial header: " + str(vcf_reader.infos)) 5340 5341 # Nb Variants POS 5342 log.debug("NB Variants Start") 5343 nb_variants = self.conn.execute( 5344 f"SELECT count(*) AS count FROM variants" 5345 ).fetchdf()["count"][0] 5346 log.debug("NB Variants Stop") 5347 5348 # Existing annotations 5349 for vcf_annotation in self.get_header().infos: 5350 5351 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5352 log.debug( 5353 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5354 ) 5355 5356 # Added columns 5357 added_columns = [] 5358 5359 # drop indexes 5360 log.debug(f"Drop indexes...") 5361 self.drop_indexes() 5362 5363 if annotations: 5364 5365 if "ALL" in annotations: 5366 5367 all_param = annotations.get("ALL", {}) 5368 all_param_formats = all_param.get("formats", None) 5369 all_param_releases = all_param.get("releases", None) 5370 5371 databases_infos_dict = self.scan_databases( 5372 database_formats=all_param_formats, 5373 database_releases=all_param_releases, 5374 ) 5375 for database_infos in databases_infos_dict.keys(): 5376 if database_infos not in annotations: 5377 annotations[database_infos] = {"INFO": None} 5378 5379 for annotation in annotations: 5380 5381 if annotation in ["ALL"]: 
5382 continue 5383 5384 # Annotation Name 5385 annotation_name = os.path.basename(annotation) 5386 5387 # Annotation fields 5388 annotation_fields = annotations[annotation] 5389 if not annotation_fields: 5390 annotation_fields = {"INFO": None} 5391 5392 log.debug(f"Annotation '{annotation_name}'") 5393 log.debug( 5394 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5395 ) 5396 5397 # Create Database 5398 database = Database( 5399 database=annotation, 5400 databases_folders=databases_folders, 5401 assembly=assembly, 5402 ) 5403 5404 # Find files 5405 parquet_file = database.get_database() 5406 parquet_hdr_file = database.get_header_file() 5407 parquet_type = database.get_type() 5408 5409 # Check if files exists 5410 if not parquet_file or not parquet_hdr_file: 5411 log.error("Annotation failed: file not found") 5412 raise ValueError("Annotation failed: file not found") 5413 else: 5414 # Get parquet connexion 5415 parquet_sql_attach = database.get_sql_database_attach( 5416 output="query" 5417 ) 5418 if parquet_sql_attach: 5419 self.conn.execute(parquet_sql_attach) 5420 parquet_file_link = database.get_sql_database_link() 5421 # Log 5422 log.debug( 5423 f"Annotation '{annotation_name}' - file: " 5424 + str(parquet_file) 5425 + " and " 5426 + str(parquet_hdr_file) 5427 ) 5428 5429 # Database full header columns 5430 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5431 parquet_hdr_file 5432 ) 5433 # Log 5434 log.debug( 5435 "Annotation database header columns : " 5436 + str(parquet_hdr_vcf_header_columns) 5437 ) 5438 5439 # Load header as VCF object 5440 parquet_hdr_vcf_header_infos = database.get_header().infos 5441 # Log 5442 log.debug( 5443 "Annotation database header: " 5444 + str(parquet_hdr_vcf_header_infos) 5445 ) 5446 5447 # Get extra infos 5448 parquet_columns = database.get_extra_columns() 5449 # Log 5450 log.debug("Annotation database Columns: " + str(parquet_columns)) 5451 5452 # Add extra columns if "ALL" in 
annotation_fields 5453 # if "ALL" in annotation_fields: 5454 # allow_add_extra_column = True 5455 if "ALL" in annotation_fields and database.get_extra_columns(): 5456 for extra_column in database.get_extra_columns(): 5457 if ( 5458 extra_column not in annotation_fields 5459 and extra_column.replace("INFO/", "") 5460 not in parquet_hdr_vcf_header_infos 5461 ): 5462 parquet_hdr_vcf_header_infos[extra_column] = ( 5463 vcf.parser._Info( 5464 extra_column, 5465 ".", 5466 "String", 5467 f"{extra_column} description", 5468 "unknown", 5469 "unknown", 5470 self.code_type_map["String"], 5471 ) 5472 ) 5473 5474 # For all fields in database 5475 annotation_fields_all = False 5476 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5477 annotation_fields_all = True 5478 annotation_fields = { 5479 key: key for key in parquet_hdr_vcf_header_infos 5480 } 5481 5482 log.debug( 5483 "Annotation database header - All annotations added: " 5484 + str(annotation_fields) 5485 ) 5486 5487 # Init 5488 5489 # List of annotation fields to use 5490 sql_query_annotation_update_info_sets = [] 5491 5492 # List of annotation to agregate 5493 sql_query_annotation_to_agregate = [] 5494 5495 # Number of fields 5496 nb_annotation_field = 0 5497 5498 # Annotation fields processed 5499 annotation_fields_processed = [] 5500 5501 # Columns mapping 5502 map_columns = database.map_columns( 5503 columns=annotation_fields, prefixes=["INFO/"] 5504 ) 5505 5506 # Query dict for fields to remove (update option) 5507 query_dict_remove = {} 5508 5509 # Fetch Anotation fields 5510 for annotation_field in annotation_fields: 5511 5512 # annotation_field_column 5513 annotation_field_column = map_columns.get( 5514 annotation_field, "INFO" 5515 ) 5516 5517 # field new name, if parametered 5518 annotation_fields_new_name = annotation_fields.get( 5519 annotation_field, annotation_field 5520 ) 5521 if not annotation_fields_new_name: 5522 annotation_fields_new_name = annotation_field 5523 5524 # To annotate 5525 # 
force_update_annotation = True 5526 # force_append_annotation = True 5527 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5528 if annotation_field in parquet_hdr_vcf_header_infos and ( 5529 force_update_annotation 5530 or force_append_annotation 5531 or ( 5532 annotation_fields_new_name 5533 not in self.get_header().infos 5534 ) 5535 ): 5536 5537 # Add field to annotation to process list 5538 annotation_fields_processed.append( 5539 annotation_fields_new_name 5540 ) 5541 5542 # explode infos for the field 5543 annotation_fields_new_name_info_msg = "" 5544 if ( 5545 force_update_annotation 5546 and annotation_fields_new_name 5547 in self.get_header().infos 5548 ): 5549 # Remove field from INFO 5550 query = f""" 5551 UPDATE {table_variants} as table_variants 5552 SET INFO = REGEXP_REPLACE( 5553 concat(table_variants.INFO,''), 5554 ';*{annotation_fields_new_name}=[^;]*', 5555 '' 5556 ) 5557 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5558 """ 5559 annotation_fields_new_name_info_msg = " [update]" 5560 query_dict_remove[ 5561 f"remove 'INFO/{annotation_fields_new_name}'" 5562 ] = query 5563 5564 # Sep between fields in INFO 5565 nb_annotation_field += 1 5566 if nb_annotation_field > 1: 5567 annotation_field_sep = ";" 5568 else: 5569 annotation_field_sep = "" 5570 5571 log.info( 5572 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5573 ) 5574 5575 # Add INFO field to header 5576 parquet_hdr_vcf_header_infos_number = ( 5577 parquet_hdr_vcf_header_infos[annotation_field].num 5578 or "." 
5579 ) 5580 parquet_hdr_vcf_header_infos_type = ( 5581 parquet_hdr_vcf_header_infos[annotation_field].type 5582 or "String" 5583 ) 5584 parquet_hdr_vcf_header_infos_description = ( 5585 parquet_hdr_vcf_header_infos[annotation_field].desc 5586 or f"{annotation_field} description" 5587 ) 5588 parquet_hdr_vcf_header_infos_source = ( 5589 parquet_hdr_vcf_header_infos[annotation_field].source 5590 or "unknown" 5591 ) 5592 parquet_hdr_vcf_header_infos_version = ( 5593 parquet_hdr_vcf_header_infos[annotation_field].version 5594 or "unknown" 5595 ) 5596 5597 vcf_reader.infos[annotation_fields_new_name] = ( 5598 vcf.parser._Info( 5599 annotation_fields_new_name, 5600 parquet_hdr_vcf_header_infos_number, 5601 parquet_hdr_vcf_header_infos_type, 5602 parquet_hdr_vcf_header_infos_description, 5603 parquet_hdr_vcf_header_infos_source, 5604 parquet_hdr_vcf_header_infos_version, 5605 self.code_type_map[ 5606 parquet_hdr_vcf_header_infos_type 5607 ], 5608 ) 5609 ) 5610 5611 # Append 5612 if force_append_annotation: 5613 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5614 else: 5615 query_case_when_append = "" 5616 5617 # Annotation/Update query fields 5618 # Found in INFO column 5619 if ( 5620 annotation_field_column == "INFO" 5621 and "INFO" in parquet_hdr_vcf_header_columns 5622 ): 5623 sql_query_annotation_update_info_sets.append( 5624 f""" 5625 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5626 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5627 ELSE '' 5628 END 5629 """ 5630 ) 5631 # Found in a specific column 5632 else: 5633 sql_query_annotation_update_info_sets.append( 5634 f""" 5635 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5636 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5637 ELSE '' 5638 END 5639 """ 5640 ) 5641 sql_query_annotation_to_agregate.append( 5642 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5643 ) 5644 5645 # Not to annotate 5646 else: 5647 5648 if force_update_annotation: 5649 annotation_message = "forced" 5650 else: 5651 annotation_message = "skipped" 5652 5653 if annotation_field not in parquet_hdr_vcf_header_infos: 5654 log.warning( 5655 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5656 ) 5657 if annotation_fields_new_name in self.get_header().infos: 5658 log.warning( 5659 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5660 ) 5661 5662 # Check if ALL fields have to be annotated. Thus concat all INFO field 5663 # allow_annotation_full_info = True 5664 allow_annotation_full_info = not force_append_annotation 5665 5666 if parquet_type in ["regions"]: 5667 allow_annotation_full_info = False 5668 5669 if ( 5670 allow_annotation_full_info 5671 and nb_annotation_field == len(annotation_fields) 5672 and annotation_fields_all 5673 and ( 5674 "INFO" in parquet_hdr_vcf_header_columns 5675 and "INFO" in database.get_extra_columns() 5676 ) 5677 ): 5678 log.debug("Column INFO annotation enabled") 5679 sql_query_annotation_update_info_sets = [] 5680 sql_query_annotation_update_info_sets.append( 5681 f" table_parquet.INFO " 5682 ) 5683 5684 if sql_query_annotation_update_info_sets: 5685 5686 # Annotate 5687 log.info(f"Annotation '{annotation_name}' - Annotation...") 5688 5689 # Join query annotation update info sets for SQL 5690 sql_query_annotation_update_info_sets_sql = ",".join( 5691 sql_query_annotation_update_info_sets 5692 ) 5693 5694 # Check chromosomes list (and variants 
infos) 5695 sql_query_chromosomes = f""" 5696 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5697 FROM {table_variants} as table_variants 5698 GROUP BY table_variants."#CHROM" 5699 ORDER BY table_variants."#CHROM" 5700 """ 5701 sql_query_chromosomes_df = self.conn.execute( 5702 sql_query_chromosomes 5703 ).df() 5704 sql_query_chromosomes_dict = { 5705 entry["CHROM"]: { 5706 "count": entry["count_variants"], 5707 "min": entry["min_variants"], 5708 "max": entry["max_variants"], 5709 } 5710 for index, entry in sql_query_chromosomes_df.iterrows() 5711 } 5712 5713 # Init 5714 nb_of_query = 0 5715 nb_of_variant_annotated = 0 5716 query_dict = query_dict_remove 5717 5718 # for chrom in sql_query_chromosomes_df["CHROM"]: 5719 for chrom in sql_query_chromosomes_dict: 5720 5721 # Number of variant by chromosome 5722 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5723 chrom, {} 5724 ).get("count", 0) 5725 5726 log.debug( 5727 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5728 ) 5729 5730 # Annotation with regions database 5731 if parquet_type in ["regions"]: 5732 sql_query_annotation_from_clause = f""" 5733 FROM ( 5734 SELECT 5735 '{chrom}' AS \"#CHROM\", 5736 table_variants_from.\"POS\" AS \"POS\", 5737 {",".join(sql_query_annotation_to_agregate)} 5738 FROM {table_variants} as table_variants_from 5739 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5740 table_parquet_from."#CHROM" = '{chrom}' 5741 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5742 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5743 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5744 ) 5745 ) 5746 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5747 GROUP BY table_variants_from.\"POS\" 5748 ) 5749 as table_parquet 5750 """ 5751 5752 sql_query_annotation_where_clause = """ 5753 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5754 AND table_parquet.\"POS\" = table_variants.\"POS\" 5755 """ 5756 5757 # Annotation with variants database 5758 else: 5759 sql_query_annotation_from_clause = f""" 5760 FROM {parquet_file_link} as table_parquet 5761 """ 5762 sql_query_annotation_where_clause = f""" 5763 table_variants."#CHROM" = '{chrom}' 5764 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5765 AND table_parquet.\"POS\" = table_variants.\"POS\" 5766 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5767 AND table_parquet.\"REF\" = table_variants.\"REF\" 5768 """ 5769 5770 # Create update query 5771 sql_query_annotation_chrom_interval_pos = f""" 5772 UPDATE {table_variants} as table_variants 5773 SET INFO = 5774 concat( 5775 CASE WHEN table_variants.INFO NOT IN ('','.') 5776 THEN table_variants.INFO 5777 ELSE '' 5778 END 5779 , 5780 CASE WHEN table_variants.INFO NOT IN ('','.') 5781 AND ( 5782 concat({sql_query_annotation_update_info_sets_sql}) 5783 ) 5784 NOT IN ('','.') 5785 THEN ';' 5786 ELSE '' 5787 END 5788 , 5789 
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The concatenated CASE expressions can be deeply nested;
                        # raise duckdb's expression-depth limit before running them.
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # NOTE(review): assumes duckdb exposes the UPDATE's
                            # affected-row count as a "Count" column — confirm
                            # against the duckdb version in use.
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        Annotate variants with splice prediction tools (SPiP / SpliceAI).

        Exports the variants to a temporary VCF, runs the SpliceToolBox
        nextflow pipeline inside a docker container, then merges the
        annotated VCF (header and INFO fields) back into the variants table.

        :param threads: The number of threads to use
            (defaults to self.get_threads())
        :return: None — the variants table and the header are updated in place
        """

        # DEBUG
        log.debug("Start annotation with splice tools")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # delete_tmp is derived here but not used below in this method
        # — presumably kept for parity with sibling annotation methods.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))
        # Tool config from user config, falling back to built-in defaults.
        splice_config = config.get("tools", {}).get("splice", {})
        if not splice_config:
            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
        if not splice_config:
            msg_err = "No Splice tool config"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"splice_config={splice_config}")

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("splice", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Splice docker image
        splice_docker_image = splice_config.get("docker").get("image")

        # Pull splice image if it's not already there
        if not check_docker_image_exists(splice_docker_image):
            log.warning(
                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
            )
            try:
                command(f"docker pull {splice_config.get('docker').get('image')}")
            except subprocess.CalledProcessError:
                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
                log.error(msg_err)
                raise ValueError(msg_err)
                # NOTE(review): unreachable — follows a raise.
                return None

        # Config - splice databases
        # NOTE(review): splice_databases is computed but not referenced later in
        # this method — confirm whether it is dead code.
        splice_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("splice", DEFAULT_SPLICE_FOLDER)
        )
        splice_databases = full_path(splice_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        options = param.get("annotation", {}).get("splice", {})
        log.debug("Options: " + str(options))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return None

        # Export in VCF
        log.debug("Create initial file to annotate")

        # Create output folder
        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
        if not os.path.exists(output_folder):
            Path(output_folder).mkdir(parents=True, exist_ok=True)

        # Create tmp VCF file
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=output_folder,
            suffix=".vcf",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        header = self.get_header()

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # NOTE(review): memory_limit is computed/logged but not used later in
        # this method — confirm whether the pipeline should receive it.
        if config.get("memory", None):
            memory_limit = config.get("memory", "8G").upper()
            # upper()
        else:
            memory_limit = "8G"
        log.debug(f"memory_limit: {memory_limit}")

        # Export VCF file
        self.export_variant_vcf(
            vcf_file=tmp_vcf_name,
            remove_info=True,
            add_samples=True,
            index=False,
        )

        # Create docker container and launch splice analysis
        if splice_config:

            # Splice mount folders
            mount_folders = splice_config.get("mount", {})

            # Genome mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("genomes", DEFAULT_GENOME_FOLDER)
            ] = "ro"

            # SpliceAI mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
            ] = "ro"

            # Genome mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("spip", DEFAULT_SPIP_FOLDER)
            ] = "ro"

            # Mount folders
            mount = []

            # Config mount
            # Each folder is mounted at the same path inside the container.
            mount = [
                f"-v {full_path(path)}:{full_path(path)}:{mode}"
                for path, mode in mount_folders.items()
            ]

            if any(value for value in splice_config.values() if value is None):
                log.warning("At least one splice config parameter is empty")
                return None

            # Params in splice nf
            def check_values(dico: dict):
                """
                Yield '--key value' nextflow parameters from a dict, normalizing
                the 'genome' key to hg19/hg38 aliases.
                """
                for key, val in dico.items():
                    if key == "genome":
                        if any(
                            assemb in options.get("genome", {})
                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
                        ):
                            yield f"--{key} hg19"
                        elif any(
                            assemb in options.get("genome", {})
                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
                        ):
                            yield f"--{key} hg38"
                    elif (
                        (isinstance(val, str) and val)
                        or isinstance(val, int)
                        or isinstance(val, bool)
                    ):
                        yield f"--{key} {val}"

            # Genome
            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
            options["genome"] = genome

            # NF params
            nf_params = []

            # Add options
            if options:
                nf_params = list(check_values(options))
                log.debug(f"Splice NF params: {' '.join(nf_params)}")
            else:
                log.debug("No NF params provided")

            # Add threads
            if "threads" not in options.keys():
                nf_params.append(f"--threads {threads}")

            # Genome path
            genome_path = find_genome(
                config.get("folders", {})
                .get("databases", {})
                .get("genomes", DEFAULT_GENOME_FOLDER),
                file=f"{genome}.fa",
            )
            # Add genome path
            if not genome_path:
                raise ValueError(
                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
                )
            else:
                log.debug(f"Genome: {genome_path}")
                nf_params.append(f"--genome_path {genome_path}")

            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
                """
                Locate local SPiP/SpliceAI reference files and return the
                corresponding nextflow parameters; empty list when not found
                (the pipeline then falls back to the files shipped in the image).
                """

                try:

                    # SpliceAI assembly transcriptome
                    spliceai_assembly = os.path.join(
                        config.get("folders", {})
                        .get("databases", {})
                        .get("spliceai", {}),
                        options.get("genome"),
                        "transcriptome",
                    )
                    spip_assembly = options.get("genome")

                    spip = find(
                        f"transcriptome_{spip_assembly}.RData",
                        config.get("folders", {}).get("databases", {}).get("spip", {}),
                    )
                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
                    log.debug(f"SPiP annotations: {spip}")
                    log.debug(f"SpliceAI annotations: {spliceai}")
                    if spip and spliceai:
                        return [
                            f"--spip_transcriptome {spip}",
                            f"--spliceai_annotations {spliceai}",
                        ]
                    else:
                        # TODO crash and go on with basic annotations ?
                        # raise ValueError(
                        #     "Can't find splice databases in configuration EXIT"
                        # )
                        log.warning(
                            "Can't find splice databases in configuration, use annotations file from image"
                        )
                except TypeError:
                    # os.path.join/find raise TypeError when a folder entry is
                    # missing (a dict default instead of a path string).
                    log.warning(
                        "Can't find splice databases in configuration, use annotations file from image"
                    )
                return []

            # Add options, check if transcriptome option have already beend provided
            if (
                "spip_transcriptome" not in nf_params
                and "spliceai_transcriptome" not in nf_params
            ):
                splice_reference = splice_annotations(options, config)
                if splice_reference:
                    nf_params.extend(splice_reference)

            nf_params.append(f"--output_folder {output_folder}")

            random_uuid = f"HOWARD-SPLICE-{get_random()}"
            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
            log.debug(cmd)

            splice_config["docker"]["command"] = cmd

            docker_cmd = get_bin_command(
                tool="splice",
                bin_type="docker",
                config=config,
                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
                add_options=f"--name {random_uuid} {' '.join(mount)}",
            )

            # Docker debug
            # if splice_config.get("rm_container"):
            #     rm_container = "--rm"
            # else:
            #     rm_container = ""
            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"

            log.debug(docker_cmd)
            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
            log.debug(res.stdout)
            if res.stderr:
                log.error(res.stderr)
            res.check_returncode()
        else:
            log.warning(f"Splice tool configuration not found: {config}")

        # Update variants
        log.info("Annotation - Updating...")
        # Test find output vcf
        log.debug(
            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        )
        output_vcf = []
        # Wrong folder to look in
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if (
                files
                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
            ):
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        # log.debug(os.listdir(options.get("output_folder")))
        # NOTE(review): output_vcf[0] is dereferenced here BEFORE the emptiness
        # check below — raises IndexError when the pipeline produced no output.
        # The 'if not output_vcf' branch is effectively unreachable.
        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Create new header with splice infos
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            self.update_from_vcf(output_vcf[0])

        # Remove folder
        remove_if_exists(output_folder)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        :param name: The `get_config_default` function returns a dictionary containing default
        configurations for different calculations and prioritizations. The `name` parameter is used to
The `name` parameter is used to 6193 specify which specific configuration to retrieve from the dictionary 6194 :type name: str 6195 :return: The function `get_config_default` returns a dictionary containing default configuration 6196 settings for different calculations and prioritizations. The specific configuration settings are 6197 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6198 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6199 returned. If there is no match, an empty dictionary is returned. 6200 """ 6201 6202 config_default = { 6203 "calculations": { 6204 "variant_chr_pos_alt_ref": { 6205 "type": "sql", 6206 "name": "variant_chr_pos_alt_ref", 6207 "description": "Create a variant ID with chromosome, position, alt and ref", 6208 "available": False, 6209 "output_column_name": "variant_chr_pos_alt_ref", 6210 "output_column_type": "String", 6211 "output_column_description": "variant ID with chromosome, position, alt and ref", 6212 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6213 "operation_info": True, 6214 }, 6215 "VARTYPE": { 6216 "type": "sql", 6217 "name": "VARTYPE", 6218 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6219 "available": True, 6220 "output_column_name": "VARTYPE", 6221 "output_column_type": "String", 6222 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6223 "operation_query": """ 6224 CASE 6225 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6226 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6227 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6228 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6229 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6230 ELSE 'UNDEFINED' 6231 END 6232 """, 6233 "info_fields": ["SVTYPE"], 6234 "operation_info": True, 6235 }, 6236 "snpeff_hgvs": { 6237 "type": "python", 6238 "name": "snpeff_hgvs", 6239 "description": "HGVS nomenclatures from snpEff annotation", 6240 "available": True, 6241 "function_name": "calculation_extract_snpeff_hgvs", 6242 "function_params": [], 6243 }, 6244 "NOMEN": { 6245 "type": "python", 6246 "name": "NOMEN", 6247 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6248 "available": True, 6249 "function_name": "calculation_extract_nomen", 6250 "function_params": [], 6251 }, 6252 "FINDBYPIPELINE": { 6253 "type": "python", 6254 "name": "FINDBYPIPELINE", 6255 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6256 "available": True, 6257 "function_name": "calculation_find_by_pipeline", 6258 "function_params": ["findbypipeline"], 6259 }, 6260 "FINDBYSAMPLE": { 6261 "type": "python", 6262 "name": "FINDBYSAMPLE", 6263 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6264 "available": True, 6265 "function_name": "calculation_find_by_pipeline", 6266 "function_params": ["findbysample"], 6267 }, 6268 "GENOTYPECONCORDANCE": { 6269 "type": "python", 6270 "name": "GENOTYPECONCORDANCE", 6271 "description": "Concordance of genotype for multi caller VCF", 6272 "available": True, 6273 "function_name": "calculation_genotype_concordance", 6274 "function_params": [], 6275 }, 6276 "BARCODE": { 6277 "type": "python", 6278 "name": "BARCODE", 6279 "description": "BARCODE as VaRank tool", 6280 "available": True, 6281 "function_name": "calculation_barcode", 6282 "function_params": [], 6283 }, 6284 "BARCODEFAMILY": { 6285 "type": "python", 6286 "name": "BARCODEFAMILY", 6287 "description": "BARCODEFAMILY as VaRank tool", 6288 "available": True, 6289 "function_name": "calculation_barcode_family", 6290 "function_params": ["BCF"], 6291 }, 6292 "TRIO": { 6293 "type": "python", 6294 "name": "TRIO", 6295 "description": "Inheritance for a trio family", 6296 "available": True, 6297 "function_name": "calculation_trio", 6298 "function_params": [], 6299 }, 6300 "VAF": { 6301 "type": "python", 6302 "name": "VAF", 6303 "description": "Variant Allele Frequency (VAF) harmonization", 6304 "available": True, 6305 "function_name": "calculation_vaf_normalization", 6306 "function_params": [], 6307 }, 6308 "VAF_stats": { 6309 "type": "python", 6310 "name": 
"VAF_stats", 6311 "description": "Variant Allele Frequency (VAF) statistics", 6312 "available": True, 6313 "function_name": "calculation_genotype_stats", 6314 "function_params": ["VAF"], 6315 }, 6316 "DP_stats": { 6317 "type": "python", 6318 "name": "DP_stats", 6319 "description": "Depth (DP) statistics", 6320 "available": True, 6321 "function_name": "calculation_genotype_stats", 6322 "function_params": ["DP"], 6323 }, 6324 "variant_id": { 6325 "type": "python", 6326 "name": "variant_id", 6327 "description": "Variant ID generated from variant position and type", 6328 "available": True, 6329 "function_name": "calculation_variant_id", 6330 "function_params": [], 6331 }, 6332 }, 6333 "prioritizations": { 6334 "default": { 6335 "filter": [ 6336 { 6337 "type": "notequals", 6338 "value": "!PASS|\\.", 6339 "score": 0, 6340 "flag": "FILTERED", 6341 "comment": ["Bad variant quality"], 6342 }, 6343 { 6344 "type": "equals", 6345 "value": "REJECT", 6346 "score": -20, 6347 "flag": "PASS", 6348 "comment": ["Bad variant quality"], 6349 }, 6350 ], 6351 "DP": [ 6352 { 6353 "type": "gte", 6354 "value": "50", 6355 "score": 5, 6356 "flag": "PASS", 6357 "comment": ["DP higher than 50"], 6358 } 6359 ], 6360 "ANN": [ 6361 { 6362 "type": "contains", 6363 "value": "HIGH", 6364 "score": 5, 6365 "flag": "PASS", 6366 "comment": [ 6367 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6368 ], 6369 }, 6370 { 6371 "type": "contains", 6372 "value": "MODERATE", 6373 "score": 3, 6374 "flag": "PASS", 6375 "comment": [ 6376 "A non-disruptive variant that might change protein effectiveness" 6377 ], 6378 }, 6379 { 6380 "type": "contains", 6381 "value": "LOW", 6382 "score": 0, 6383 "flag": "FILTERED", 6384 "comment": [ 6385 "Assumed to be mostly harmless or unlikely to change protein behavior" 6386 ], 6387 }, 6388 { 6389 "type": "contains", 6390 "value": "MODIFIER", 6391 "score": 0, 6392 
"flag": "FILTERED", 6393 "comment": [ 6394 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6395 ], 6396 }, 6397 ], 6398 } 6399 }, 6400 } 6401 6402 return config_default.get(name, None) 6403 6404 def get_config_json( 6405 self, name: str, config_dict: dict = {}, config_file: str = None 6406 ) -> dict: 6407 """ 6408 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6409 default values, a dictionary, and a file. 6410 6411 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6412 the name of the configuration. It is used to identify and retrieve the configuration settings 6413 for a specific component or module 6414 :type name: str 6415 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6416 dictionary that allows you to provide additional configuration settings or overrides. When you 6417 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6418 the key is the configuration setting you want to override or 6419 :type config_dict: dict 6420 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6421 specify the path to a configuration file that contains additional settings. If provided, the 6422 function will read the contents of this file and update the configuration dictionary with the 6423 values found in the file, overriding any existing values with the 6424 :type config_file: str 6425 :return: The function `get_config_json` returns a dictionary containing the configuration 6426 settings. 
6427 """ 6428 6429 # Create with default prioritizations 6430 config_default = self.get_config_default(name=name) 6431 configuration = config_default 6432 # log.debug(f"configuration={configuration}") 6433 6434 # Replace prioritizations from dict 6435 for config in config_dict: 6436 configuration[config] = config_dict[config] 6437 6438 # Replace prioritizations from file 6439 config_file = full_path(config_file) 6440 if config_file: 6441 if os.path.exists(config_file): 6442 with open(config_file) as config_file_content: 6443 config_file_dict = json.load(config_file_content) 6444 for config in config_file_dict: 6445 configuration[config] = config_file_dict[config] 6446 else: 6447 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6448 log.error(msg_error) 6449 raise ValueError(msg_error) 6450 6451 return configuration 6452 6453 # def get_prioritizations_config(self, prioritizations_config_dict:dict = {}, prioritizations_config_file:str = None) -> dict: 6454 6455 # # Create with default prioritizations 6456 # prioritizations_config = self.get_config_default("prioritization") 6457 6458 # # Replace prioritizations from dict 6459 # for prioritization_config in prioritizations_config_dict: 6460 # prioritizations_config[prioritization_config] = prioritizations_config_dict[prioritization_config] 6461 6462 # # Replace prioritizations from file 6463 # prioritizations_config_file = full_path(prioritizations_config_file) 6464 # if prioritizations_config_file: 6465 # if os.path.exists(prioritizations_config_file): 6466 # with open(prioritizations_config_file) as prioritizations_config_file_content: 6467 # prioritizations_config_file_dict = json.load(prioritizations_config_file_content) 6468 # for prioritization_config in prioritizations_config_file_dict: 6469 # prioritizations_config[prioritization_config] = prioritizations_config_file_dict[prioritization_config] 6470 # else: 6471 # log.error(f"Prioritizations config file '{prioritizations_config_file}' does NOT 
exist") 6472 # raise ValueError(f"Prioritizations config file '{prioritizations_config_file}' does NOT exist") 6473 6474 # return prioritizations_config 6475 6476 def prioritization(self) -> None: 6477 """ 6478 It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other 6479 INFO fields 6480 """ 6481 6482 # Config 6483 config = self.get_config() 6484 6485 # Param 6486 param = self.get_param() 6487 6488 # Quick Prioritizations 6489 # prioritizations = param.get("prioritization", {}).get("prioritizations", "") 6490 6491 # Configuration profiles 6492 prioritization_config_file = param.get("prioritization", {}).get( 6493 "prioritization_config", None 6494 ) 6495 prioritization_config_file = full_path(prioritization_config_file) 6496 prioritizations_config = self.get_config_json( 6497 name="prioritizations", config_file=prioritization_config_file 6498 ) 6499 6500 # Prioritization options 6501 profiles = param.get("prioritization", {}).get("profiles", []) 6502 if isinstance(profiles, str): 6503 profiles = profiles.split(",") 6504 pzfields = param.get("prioritization", {}).get( 6505 "pzfields", ["PZFlag", "PZScore"] 6506 ) 6507 if isinstance(pzfields, str): 6508 pzfields = pzfields.split(",") 6509 default_profile = param.get("prioritization", {}).get("default_profile", None) 6510 pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_") 6511 prioritization_score_mode = param.get("prioritization", {}).get( 6512 "prioritization_score_mode", "HOWARD" 6513 ) 6514 6515 # Quick Prioritizations 6516 # prioritizations = param.get("prioritization", {}).get("prioritizations", None) 6517 prioritizations = param.get("prioritizations", None) 6518 if prioritizations: 6519 log.info("Quick Prioritization:") 6520 for profile in prioritizations.split(","): 6521 if profile not in profiles: 6522 profiles.append(profile) 6523 log.info(f" {profile}") 6524 6525 # If profile "ALL" provided, all profiles in the config profiles 6526 if "ALL" in 
profiles: 6527 profiles = list(prioritizations_config.keys()) 6528 6529 for profile in profiles: 6530 if prioritizations_config.get(profile, None): 6531 log.debug(f"Profile '{profile}' configured") 6532 else: 6533 msg_error = f"Profile '{profile}' NOT configured" 6534 log.error(msg_error) 6535 raise ValueError(msg_error) 6536 6537 if profiles: 6538 log.info(f"Prioritization... ") 6539 else: 6540 log.debug(f"No profile defined") 6541 return 6542 6543 if not default_profile and len(profiles): 6544 default_profile = profiles[0] 6545 6546 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6547 log.debug("Profiles to check: " + str(list(profiles))) 6548 6549 # Variables 6550 table_variants = self.get_table_variants(clause="update") 6551 6552 # Added columns 6553 added_columns = [] 6554 6555 # Create list of PZfields 6556 # List of PZFields 6557 list_of_pzfields_original = pzfields + [ 6558 pzfield + pzfields_sep + profile 6559 for pzfield in pzfields 6560 for profile in profiles 6561 ] 6562 list_of_pzfields = [] 6563 log.debug(f"{list_of_pzfields_original}") 6564 6565 # Remove existing PZfields to use if exists 6566 for pzfield in list_of_pzfields_original: 6567 if self.get_header().infos.get(pzfield, None) is None: 6568 list_of_pzfields.append(pzfield) 6569 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6570 else: 6571 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6572 6573 if list_of_pzfields: 6574 6575 # Explode Infos fields 6576 explode_infos_prefix = self.get_explode_infos_prefix() 6577 added_columns += self.explode_infos(prefix=explode_infos_prefix) 6578 extra_infos = self.get_extra_infos() 6579 6580 # PZfields tags description 6581 PZfields_INFOS = { 6582 "PZTags": { 6583 "ID": "PZTags", 6584 "Number": ".", 6585 "Type": "String", 6586 "Description": "Variant tags based on annotation criteria", 6587 }, 6588 "PZScore": { 6589 "ID": "PZScore", 6590 "Number": 1, 6591 "Type": "Integer", 6592 
"Description": "Variant score based on annotation criteria", 6593 }, 6594 "PZFlag": { 6595 "ID": "PZFlag", 6596 "Number": 1, 6597 "Type": "String", 6598 "Description": "Variant flag based on annotation criteria", 6599 }, 6600 "PZComment": { 6601 "ID": "PZComment", 6602 "Number": ".", 6603 "Type": "String", 6604 "Description": "Variant comment based on annotation criteria", 6605 }, 6606 "PZInfos": { 6607 "ID": "PZInfos", 6608 "Number": ".", 6609 "Type": "String", 6610 "Description": "Variant infos based on annotation criteria", 6611 }, 6612 } 6613 6614 # Create INFO fields if not exist 6615 for field in PZfields_INFOS: 6616 field_ID = PZfields_INFOS[field]["ID"] 6617 field_description = PZfields_INFOS[field]["Description"] 6618 if field_ID not in self.get_header().infos and field_ID in pzfields: 6619 field_description = ( 6620 PZfields_INFOS[field]["Description"] 6621 + f", profile {default_profile}" 6622 ) 6623 self.get_header().infos[field_ID] = vcf.parser._Info( 6624 field_ID, 6625 PZfields_INFOS[field]["Number"], 6626 PZfields_INFOS[field]["Type"], 6627 field_description, 6628 "unknown", 6629 "unknown", 6630 code_type_map[PZfields_INFOS[field]["Type"]], 6631 ) 6632 6633 # Create INFO fields if not exist for each profile 6634 for profile in prioritizations_config: 6635 if profile in profiles or profiles == []: 6636 for field in PZfields_INFOS: 6637 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6638 field_description = ( 6639 PZfields_INFOS[field]["Description"] 6640 + f", profile {profile}" 6641 ) 6642 if ( 6643 field_ID not in self.get_header().infos 6644 and field in pzfields 6645 ): 6646 self.get_header().infos[field_ID] = vcf.parser._Info( 6647 field_ID, 6648 PZfields_INFOS[field]["Number"], 6649 PZfields_INFOS[field]["Type"], 6650 field_description, 6651 "unknown", 6652 "unknown", 6653 code_type_map[PZfields_INFOS[field]["Type"]], 6654 ) 6655 6656 # Header 6657 for pzfield in list_of_pzfields: 6658 if re.match("PZScore.*", pzfield): 6659 
added_column = self.add_column( 6660 table_name=table_variants, 6661 column_name=pzfield, 6662 column_type="INTEGER", 6663 default_value="0", 6664 ) 6665 elif re.match("PZFlag.*", pzfield): 6666 added_column = self.add_column( 6667 table_name=table_variants, 6668 column_name=pzfield, 6669 column_type="BOOLEAN", 6670 default_value="1", 6671 ) 6672 else: 6673 added_column = self.add_column( 6674 table_name=table_variants, 6675 column_name=pzfield, 6676 column_type="STRING", 6677 default_value="''", 6678 ) 6679 added_columns.append(added_column) 6680 6681 # Profiles 6682 if profiles: 6683 6684 # foreach profile in configuration file 6685 for profile in prioritizations_config: 6686 6687 # If profile is asked in param, or ALL are asked (empty profile []) 6688 if profile in profiles or profiles == []: 6689 log.info(f"Profile '{profile}'") 6690 6691 sql_set_info_option = "" 6692 6693 sql_set_info = [] 6694 6695 # PZ fields set 6696 6697 # PZScore 6698 if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields: 6699 sql_set_info.append( 6700 f""" 6701 concat( 6702 'PZScore{pzfields_sep}{profile}=', 6703 PZScore{pzfields_sep}{profile} 6704 ) 6705 """ 6706 ) 6707 if ( 6708 profile == default_profile 6709 and "PZScore" in list_of_pzfields 6710 ): 6711 sql_set_info.append( 6712 f""" 6713 concat( 6714 'PZScore=', 6715 PZScore{pzfields_sep}{profile} 6716 ) 6717 """ 6718 ) 6719 6720 # PZFlag 6721 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6722 sql_set_info.append( 6723 f""" 6724 concat( 6725 'PZFlag{pzfields_sep}{profile}=', 6726 CASE 6727 WHEN PZFlag{pzfields_sep}{profile}==1 6728 THEN 'PASS' 6729 WHEN PZFlag{pzfields_sep}{profile}==0 6730 THEN 'FILTERED' 6731 END 6732 ) 6733 """ 6734 ) 6735 if ( 6736 profile == default_profile 6737 and "PZFlag" in list_of_pzfields 6738 ): 6739 sql_set_info.append( 6740 f""" 6741 concat( 6742 'PZFlag=', 6743 CASE 6744 WHEN PZFlag{pzfields_sep}{profile}==1 6745 THEN 'PASS' 6746 WHEN PZFlag{pzfields_sep}{profile}==0 6747 THEN 
'FILTERED' 6748 END 6749 ) 6750 """ 6751 ) 6752 6753 # PZComment 6754 if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields: 6755 sql_set_info.append( 6756 f""" 6757 CASE 6758 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6759 THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile}) 6760 ELSE '' 6761 END 6762 """ 6763 ) 6764 if ( 6765 profile == default_profile 6766 and "PZComment" in list_of_pzfields 6767 ): 6768 sql_set_info.append( 6769 f""" 6770 CASE 6771 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6772 THEN concat('PZComment=', PZComment{pzfields_sep}{profile}) 6773 ELSE '' 6774 END 6775 """ 6776 ) 6777 6778 # PZInfos 6779 if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields: 6780 sql_set_info.append( 6781 f""" 6782 CASE 6783 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6784 THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile}) 6785 ELSE '' 6786 END 6787 """ 6788 ) 6789 if ( 6790 profile == default_profile 6791 and "PZInfos" in list_of_pzfields 6792 ): 6793 sql_set_info.append( 6794 f""" 6795 CASE 6796 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6797 THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile}) 6798 ELSE '' 6799 END 6800 """ 6801 ) 6802 6803 # Merge PZfields 6804 sql_set_info_option = "" 6805 sql_set_sep = "" 6806 for sql_set in sql_set_info: 6807 if sql_set_sep: 6808 sql_set_info_option += f""" 6809 , concat('{sql_set_sep}', {sql_set}) 6810 """ 6811 else: 6812 sql_set_info_option += f""" 6813 , {sql_set} 6814 """ 6815 sql_set_sep = ";" 6816 6817 sql_queries = [] 6818 for annotation in prioritizations_config[profile]: 6819 6820 # Check if annotation field is present 6821 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 6822 log.debug(f"Annotation '{annotation}' not in data") 6823 continue 6824 else: 6825 log.debug(f"Annotation '{annotation}' in data") 6826 6827 # For each criterions 6828 for criterion in prioritizations_config[profile][ 6829 annotation 6830 ]: 
6831 criterion_type = criterion["type"] 6832 criterion_value = criterion["value"] 6833 criterion_score = criterion.get("score", 0) 6834 criterion_flag = criterion.get("flag", "PASS") 6835 criterion_flag_bool = criterion_flag == "PASS" 6836 criterion_comment = ( 6837 ", ".join(criterion.get("comment", [])) 6838 .replace("'", "''") 6839 .replace(";", ",") 6840 .replace("\t", " ") 6841 ) 6842 criterion_infos = ( 6843 str(criterion) 6844 .replace("'", "''") 6845 .replace(";", ",") 6846 .replace("\t", " ") 6847 ) 6848 6849 sql_set = [] 6850 sql_set_info = [] 6851 6852 # PZ fields set 6853 if ( 6854 f"PZScore{pzfields_sep}{profile}" 6855 in list_of_pzfields 6856 ): 6857 if prioritization_score_mode == "HOWARD": 6858 sql_set.append( 6859 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6860 ) 6861 elif prioritization_score_mode == "VaRank": 6862 sql_set.append( 6863 f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END" 6864 ) 6865 else: 6866 sql_set.append( 6867 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6868 ) 6869 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6870 sql_set.append( 6871 f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}" 6872 ) 6873 if ( 6874 f"PZComment{pzfields_sep}{profile}" 6875 in list_of_pzfields 6876 ): 6877 sql_set.append( 6878 f""" 6879 PZComment{pzfields_sep}{profile} = 6880 concat( 6881 PZComment{pzfields_sep}{profile}, 6882 CASE 6883 WHEN PZComment{pzfields_sep}{profile}!='' 6884 THEN ', ' 6885 ELSE '' 6886 END, 6887 '{criterion_comment}' 6888 ) 6889 """ 6890 ) 6891 if ( 6892 f"PZInfos{pzfields_sep}{profile}" 6893 in list_of_pzfields 6894 ): 6895 sql_set.append( 6896 f""" 6897 PZInfos{pzfields_sep}{profile} = 6898 concat( 6899 PZInfos{pzfields_sep}{profile}, 6900 '{criterion_infos}' 6901 ) 6902 """ 6903 ) 6904 sql_set_option = ",".join(sql_set) 
6905 6906 # Criterion and comparison 6907 try: 6908 float(criterion_value) 6909 sql_update = f""" 6910 UPDATE {table_variants} 6911 SET {sql_set_option} 6912 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 6913 AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value} 6914 """ 6915 except: 6916 contains_option = "" 6917 if criterion_type == "contains": 6918 contains_option = ".*" 6919 sql_update = f""" 6920 UPDATE {table_variants} 6921 SET {sql_set_option} 6922 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 6923 """ 6924 sql_queries.append(sql_update) 6925 6926 # PZTags 6927 if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields: 6928 6929 # Create PZFalgs value 6930 pztags_value = "" 6931 pztags_sep_default = "|" 6932 pztags_sep = "" 6933 for pzfield in pzfields: 6934 if pzfield not in ["PZTags"]: 6935 if ( 6936 f"{pzfield}{pzfields_sep}{profile}" 6937 in list_of_pzfields 6938 ): 6939 if pzfield in ["PZFlag"]: 6940 pztags_value += f"""{pztags_sep}{pzfield}#', 6941 CASE WHEN PZFlag{pzfields_sep}{profile} 6942 THEN 'PASS' 6943 ELSE 'FILTERED' 6944 END, '""" 6945 else: 6946 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 6947 pztags_sep = pztags_sep_default 6948 6949 # Add Query update for PZFlags 6950 sql_update_pztags = f""" 6951 UPDATE {table_variants} 6952 SET INFO = concat( 6953 INFO, 6954 CASE WHEN INFO NOT in ('','.') 6955 THEN ';' 6956 ELSE '' 6957 END, 6958 'PZTags{pzfields_sep}{profile}={pztags_value}' 6959 ) 6960 """ 6961 sql_queries.append(sql_update_pztags) 6962 6963 # Add Query update for PZFlags for default 6964 if profile == default_profile: 6965 sql_update_pztags_default = f""" 6966 UPDATE {table_variants} 6967 SET INFO = concat( 6968 INFO, 6969 ';', 6970 'PZTags={pztags_value}' 6971 ) 6972 """ 6973 sql_queries.append(sql_update_pztags_default) 6974 6975 log.info(f"""Profile '{profile}' - 
Prioritization... """) 6976 6977 if sql_queries: 6978 6979 for sql_query in sql_queries: 6980 log.debug( 6981 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 6982 ) 6983 self.conn.execute(sql_query) 6984 6985 log.info(f"""Profile '{profile}' - Update... """) 6986 sql_query_update = f""" 6987 UPDATE {table_variants} 6988 SET INFO = 6989 concat( 6990 CASE 6991 WHEN INFO NOT IN ('','.') 6992 THEN concat(INFO, ';') 6993 ELSE '' 6994 END 6995 {sql_set_info_option} 6996 ) 6997 """ 6998 self.conn.execute(sql_query_update) 6999 7000 else: 7001 7002 log.warning(f"No profiles in parameters") 7003 7004 # Remove added columns 7005 for added_column in added_columns: 7006 self.drop_column(column=added_column) 7007 7008 # Explode INFOS fields into table fields 7009 if self.get_explode_infos(): 7010 self.explode_infos( 7011 prefix=self.get_explode_infos_prefix(), 7012 fields=self.get_explode_infos_fields(), 7013 force=True, 7014 ) 7015 7016 return 7017 7018 ### 7019 # HGVS 7020 ### 7021 7022 def annotation_hgvs(self, threads: int = None) -> None: 7023 """ 7024 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7025 coordinates and alleles. 7026 7027 :param threads: The `threads` parameter is an optional integer that specifies the number of 7028 threads to use for parallel processing. If no value is provided, it will default to the number 7029 of threads obtained from the `get_threads()` method 7030 :type threads: int 7031 """ 7032 7033 # Function for each partition of the Dask Dataframe 7034 def partition_function(partition): 7035 """ 7036 The function `partition_function` applies the `annotation_hgvs_partition` function to 7037 each row of a DataFrame called `partition`. 
7038 7039 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7040 to be processed 7041 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7042 the "partition" dataframe along the axis 1. 7043 """ 7044 return partition.apply(annotation_hgvs_partition, axis=1) 7045 7046 def annotation_hgvs_partition(row) -> str: 7047 """ 7048 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7049 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7050 7051 :param row: A dictionary-like object that contains the values for the following keys: 7052 :return: a string that contains the HGVS names associated with the given row of data. 7053 """ 7054 7055 chr = row["CHROM"] 7056 pos = row["POS"] 7057 ref = row["REF"] 7058 alt = row["ALT"] 7059 7060 # Find list of associated transcripts 7061 transcripts_list = list( 7062 polars_conn.execute( 7063 f""" 7064 SELECT transcript 7065 FROM refseq_df 7066 WHERE CHROM='{chr}' 7067 AND POS={pos} 7068 """ 7069 )["transcript"] 7070 ) 7071 7072 # Full HGVS annotation in list 7073 hgvs_full_list = [] 7074 7075 for transcript_name in transcripts_list: 7076 7077 # Transcript 7078 transcript = get_transcript( 7079 transcripts=transcripts, transcript_name=transcript_name 7080 ) 7081 # Exon 7082 if use_exon: 7083 exon = transcript.find_exon_number(pos) 7084 else: 7085 exon = None 7086 # Protein 7087 transcript_protein = None 7088 if use_protein or add_protein or full_format: 7089 transcripts_protein = list( 7090 polars_conn.execute( 7091 f""" 7092 SELECT protein 7093 FROM refseqlink_df 7094 WHERE transcript='{transcript_name}' 7095 LIMIT 1 7096 """ 7097 )["protein"] 7098 ) 7099 if len(transcripts_protein): 7100 transcript_protein = transcripts_protein[0] 7101 7102 # HGVS name 7103 hgvs_name = format_hgvs_name( 7104 chr, 7105 pos, 7106 ref, 7107 alt, 7108 genome=genome, 7109 transcript=transcript, 7110 
transcript_protein=transcript_protein, 7111 exon=exon, 7112 use_gene=use_gene, 7113 use_protein=use_protein, 7114 full_format=full_format, 7115 use_version=use_version, 7116 codon_type=codon_type, 7117 ) 7118 hgvs_full_list.append(hgvs_name) 7119 if add_protein and not use_protein and not full_format: 7120 hgvs_name = format_hgvs_name( 7121 chr, 7122 pos, 7123 ref, 7124 alt, 7125 genome=genome, 7126 transcript=transcript, 7127 transcript_protein=transcript_protein, 7128 exon=exon, 7129 use_gene=use_gene, 7130 use_protein=True, 7131 full_format=False, 7132 use_version=use_version, 7133 codon_type=codon_type, 7134 ) 7135 hgvs_full_list.append(hgvs_name) 7136 7137 # Create liste of HGVS annotations 7138 hgvs_full = ",".join(hgvs_full_list) 7139 7140 return hgvs_full 7141 7142 # Polars connexion 7143 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7144 7145 # Config 7146 config = self.get_config() 7147 7148 # Databases 7149 # Genome 7150 databases_genomes_folders = ( 7151 config.get("folders", {}) 7152 .get("databases", {}) 7153 .get("genomes", DEFAULT_GENOME_FOLDER) 7154 ) 7155 databases_genome = ( 7156 config.get("folders", {}).get("databases", {}).get("genomes", "") 7157 ) 7158 # refseq database folder 7159 databases_refseq_folders = ( 7160 config.get("folders", {}) 7161 .get("databases", {}) 7162 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7163 ) 7164 # refseq 7165 databases_refseq = config.get("databases", {}).get("refSeq", None) 7166 # refSeqLink 7167 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7168 7169 # Param 7170 param = self.get_param() 7171 7172 # Quick HGVS 7173 if "hgvs_options" in param and param.get("hgvs_options", ""): 7174 log.info(f"Quick HGVS Annotation:") 7175 if not param.get("hgvs", None): 7176 param["hgvs"] = {} 7177 for option in param.get("hgvs_options", "").split(","): 7178 option_var_val = option.split("=") 7179 option_var = option_var_val[0] 7180 if len(option_var_val) > 1: 7181 option_val = 
option_var_val[1] 7182 else: 7183 option_val = "True" 7184 if option_val.upper() in ["TRUE"]: 7185 option_val = True 7186 elif option_val.upper() in ["FALSE"]: 7187 option_val = False 7188 log.info(f" {option_var}={option_val}") 7189 param["hgvs"][option_var] = option_val 7190 7191 # Check if HGVS annotation enabled 7192 if "hgvs" in param: 7193 log.info(f"HGVS Annotation... ") 7194 for hgvs_option in param.get("hgvs", {}): 7195 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7196 else: 7197 return 7198 7199 # HGVS Param 7200 param_hgvs = param.get("hgvs", {}) 7201 use_exon = param_hgvs.get("use_exon", False) 7202 use_gene = param_hgvs.get("use_gene", False) 7203 use_protein = param_hgvs.get("use_protein", False) 7204 add_protein = param_hgvs.get("add_protein", False) 7205 full_format = param_hgvs.get("full_format", False) 7206 use_version = param_hgvs.get("use_version", False) 7207 codon_type = param_hgvs.get("codon_type", "3") 7208 7209 # refSseq refSeqLink 7210 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7211 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7212 7213 # Assembly 7214 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7215 7216 # Genome 7217 genome_file = None 7218 if find_genome(databases_genome): 7219 genome_file = find_genome(databases_genome) 7220 else: 7221 genome_file = find_genome( 7222 genome_path=databases_genomes_folders, assembly=assembly 7223 ) 7224 log.debug("Genome: " + str(genome_file)) 7225 7226 # refSseq 7227 refseq_file = find_file_prefix( 7228 input_file=databases_refseq, 7229 prefix="ncbiRefSeq", 7230 folder=databases_refseq_folders, 7231 assembly=assembly, 7232 ) 7233 log.debug("refSeq: " + str(refseq_file)) 7234 7235 # refSeqLink 7236 refseqlink_file = find_file_prefix( 7237 input_file=databases_refseqlink, 7238 prefix="ncbiRefSeqLink", 7239 folder=databases_refseq_folders, 7240 assembly=assembly, 7241 ) 7242 log.debug("refSeqLink: " + 
str(refseqlink_file)) 7243 7244 # Threads 7245 if not threads: 7246 threads = self.get_threads() 7247 log.debug("Threads: " + str(threads)) 7248 7249 # Variables 7250 table_variants = self.get_table_variants(clause="update") 7251 7252 # Get variants SNV and InDel only 7253 query_variants = f""" 7254 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7255 FROM {table_variants} 7256 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7257 """ 7258 df_variants = self.get_query_to_df(query_variants) 7259 7260 # Added columns 7261 added_columns = [] 7262 7263 # Add hgvs column in variants table 7264 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7265 added_column = self.add_column( 7266 table_variants, hgvs_column_name, "STRING", default_value=None 7267 ) 7268 added_columns.append(added_column) 7269 7270 log.debug(f"refSeq loading...") 7271 # refSeq in duckDB 7272 refseq_table = get_refseq_table( 7273 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7274 ) 7275 # Loading all refSeq in Dataframe 7276 refseq_query = f""" 7277 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7278 FROM {refseq_table} 7279 JOIN df_variants ON ( 7280 {refseq_table}.chrom = df_variants.CHROM 7281 AND {refseq_table}.txStart<=df_variants.POS 7282 AND {refseq_table}.txEnd>=df_variants.POS 7283 ) 7284 """ 7285 refseq_df = self.conn.query(refseq_query).pl() 7286 7287 if refseqlink_file: 7288 log.debug(f"refSeqLink loading...") 7289 # refSeqLink in duckDB 7290 refseqlink_table = get_refseq_table( 7291 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7292 ) 7293 # Loading all refSeqLink in Dataframe 7294 protacc_column = "protAcc_with_ver" 7295 mrnaacc_column = "mrnaAcc_with_ver" 7296 refseqlink_query = f""" 7297 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7298 FROM {refseqlink_table} 7299 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7300 WHERE 
protAcc_without_ver IS NOT NULL 7301 """ 7302 # Polars Dataframe 7303 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7304 7305 # Read RefSeq transcripts into a python dict/model. 7306 log.debug(f"Transcripts loading...") 7307 with tempfile.TemporaryDirectory() as tmpdir: 7308 transcripts_query = f""" 7309 COPY ( 7310 SELECT {refseq_table}.* 7311 FROM {refseq_table} 7312 JOIN df_variants ON ( 7313 {refseq_table}.chrom=df_variants.CHROM 7314 AND {refseq_table}.txStart<=df_variants.POS 7315 AND {refseq_table}.txEnd>=df_variants.POS 7316 ) 7317 ) 7318 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7319 """ 7320 self.conn.query(transcripts_query) 7321 with open(f"{tmpdir}/transcript.tsv") as infile: 7322 transcripts = read_transcripts(infile) 7323 7324 # Polars connexion 7325 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7326 7327 log.debug("Genome loading...") 7328 # Read genome sequence using pyfaidx. 7329 genome = Fasta(genome_file) 7330 7331 log.debug("Start annotation HGVS...") 7332 7333 # Create 7334 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7335 ddf = dd.from_pandas(df_variants, npartitions=threads) 7336 7337 # Use dask.dataframe.apply() to apply function on each partition 7338 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7339 7340 # Convert Dask DataFrame to Pandas Dataframe 7341 df = ddf.compute() 7342 7343 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
        # Materialize the annotated dataframe as parquet so duckDB can join on it
        # (per comment above: direct update hit a VARCHAR -> NULL cast error)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' to INFO for rows where the
        # temporary hgvs column is non-empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): "annotatation" typo below is runtime VCF-header text;
        # left unchanged here because changing it alters the emitted header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Register the new INFO field in the in-memory VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build the help text listing all available calculation operations.

        Only operations whose config entry has "available": true are listed.

        :param operations_config_dict: calculations configuration as a dict
        :param operations_config_file: calculations configuration file path
        :return: sorted list of help lines, with a header line inserted first

        NOTE(review): mutable default argument `{}` is shared across calls;
        harmless while it is only read, but worth confirming.
        """

        # Init
        operations_help = []

        # operations: merged calculations configuration (dict overrides file)
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f"   {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
        "calculation": {
          "NOMEN": {
            "options": {
              "hgvs_field": "hgvs"
            },
            "middle" : null
          }
        }

        :param operations: operations to run; overridden by param
            "calculation"/"calculations" when present
        :param operations_config_dict: calculations configuration as a dict
        :param operations_config_file: calculations configuration file path
        :raises ValueError: if an operation name or type is not in the config
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys: operation lookup is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add: comma-separated "calculations" shortcut param
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    # Mirror the quick operation into the param tree as well
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        # Dispatch on operation type (default "sql")
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        mathematical operation to be performed. It includes the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed. It is used for logging and error handling purposes,
        defaults to unknown
        :type operation_name: str (optional)
        """

        # table variants
        table_variants = self.get_table_variants(clause="alter")

        # Operation infos
        # NOTE(review): the `operation_name` argument is immediately overwritten
        # by the operation's own "name" entry (fallback "unknown") — confirm intended
        operation_name = operation.get("name", "unknown")
        log.debug(f"process sql {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        # operation_query may be given as a list of SQL fragments
        operation_query = operation.get("operation_query", None)
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)

        if operation_query:

            # Info fields check: verify all required INFO fields exist in header
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added_columns: temporary columns to drop at the end
                added_columns = []

                # Create VCF header field
                vcf_reader = self.get_header()
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed (make INFO fields queryable as columns)
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=True,
                )

                # Create column to hold the calculation result
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                    """
                    self.conn.execute(sql_update)

                    # Add to INFO (append '<name>=<value>' for non-empty results)
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" =
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                        """
                        self.conn.execute(sql_update_info)

                except:
                    # NOTE(review): bare except discards the original exception;
                    # consider `except Exception as e: ... raise ValueError(...) from e`
                    # so the failing SQL error is not lost
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )

    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        operation to be performed. It has the following keys: "name", "function_name", "function_params"
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the operation being performed. It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        """

        operation_name = operation["name"]
        # NOTE(review): message says "process sql" but this is the python path —
        # looks like a copy-paste leftover in the log text (runtime string, kept)
        log.debug(f"process sql {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        # Dispatch to the named method on self with positional params
        getattr(self, function_name)(*function_params)

    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
7708 """ 7709 7710 # variant_id annotation field 7711 variant_id_tag = self.get_variant_id_column() 7712 added_columns = [variant_id_tag] 7713 7714 # variant_id hgvs tags" 7715 vcf_infos_tags = { 7716 variant_id_tag: "howard variant ID annotation", 7717 } 7718 7719 # Variants table 7720 table_variants = self.get_table_variants() 7721 7722 # Header 7723 vcf_reader = self.get_header() 7724 7725 # Add variant_id to header 7726 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7727 variant_id_tag, 7728 ".", 7729 "String", 7730 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7731 "howard calculation", 7732 "0", 7733 self.code_type_map.get("String"), 7734 ) 7735 7736 # Update 7737 sql_update = f""" 7738 UPDATE {table_variants} 7739 SET "INFO" = 7740 concat( 7741 CASE 7742 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7743 THEN '' 7744 ELSE concat("INFO", ';') 7745 END, 7746 '{variant_id_tag}=', 7747 "{variant_id_tag}" 7748 ) 7749 """ 7750 self.conn.execute(sql_update) 7751 7752 # Remove added columns 7753 for added_column in added_columns: 7754 self.drop_column(column=added_column) 7755 7756 def calculation_extract_snpeff_hgvs(self) -> None: 7757 """ 7758 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7759 annotation field in a VCF file and adds them as a new column in the variants table. 
7760 """ 7761 7762 # SnpEff annotation field 7763 snpeff_ann = "ANN" 7764 7765 # SnpEff annotation field 7766 snpeff_hgvs = "snpeff_hgvs" 7767 7768 # Snpeff hgvs tags 7769 vcf_infos_tags = { 7770 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7771 } 7772 7773 # Prefix 7774 prefix = self.get_explode_infos_prefix() 7775 if prefix: 7776 prefix = "INFO/" 7777 7778 # snpEff fields 7779 speff_ann_infos = prefix + snpeff_ann 7780 speff_hgvs_infos = prefix + snpeff_hgvs 7781 7782 # Variants table 7783 table_variants = self.get_table_variants() 7784 7785 # Header 7786 vcf_reader = self.get_header() 7787 7788 # Add columns 7789 added_columns = [] 7790 7791 # Explode HGVS field in column 7792 added_columns += self.explode_infos(fields=[snpeff_ann]) 7793 7794 if "ANN" in vcf_reader.infos: 7795 7796 log.debug(vcf_reader.infos["ANN"]) 7797 7798 # Create variant id 7799 variant_id_column = self.get_variant_id_column() 7800 added_columns += [variant_id_column] 7801 7802 # Create dataframe 7803 dataframe_snpeff_hgvs = self.get_query_to_df( 7804 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 7805 ) 7806 7807 # Create main NOMEN column 7808 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 7809 speff_ann_infos 7810 ].apply(lambda x: extract_snpeff_hgvs(str(x))) 7811 7812 # Add snpeff_hgvs to header 7813 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 7814 snpeff_hgvs, 7815 ".", 7816 "String", 7817 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 7818 "howard calculation", 7819 "0", 7820 self.code_type_map.get("String"), 7821 ) 7822 7823 # Update 7824 sql_update = f""" 7825 UPDATE variants 7826 SET "INFO" = 7827 concat( 7828 CASE 7829 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7830 THEN '' 7831 ELSE concat("INFO", ';') 7832 END, 7833 CASE 7834 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 7835 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 7836 THEN concat( 7837 '{snpeff_hgvs}=', 
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field: intermediate column holding the per-variant NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output fields and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field (source INFO field, default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (preference list file; first column = transcript)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe keyed on the variant coordinates
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (lambda capture of nomen_field is safe: .apply runs immediately)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update: append all non-empty NOMEN fields to INFO, joined on coords
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples: genotype data required for this calculation
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (row-wise over genotype columns)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, joined on variant id
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # if FORMAT and samples: genotype data required for this calculation
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (row-wise over genotype columns)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # is a copy-paste leftover (runtime string, kept; the dict lookup
            # always hits so the fallback is never used)
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, joined on variant id
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.
8216 """ 8217 8218 # if FORMAT and samples 8219 if ( 8220 "FORMAT" in self.get_header_columns_as_list() 8221 and self.get_header_sample_list() 8222 ): 8223 8224 # barcode annotation field 8225 if not tag: 8226 tag = "barcode" 8227 8228 # VCF infos tags 8229 vcf_infos_tags = { 8230 tag: "barcode calculation (VaRank)", 8231 } 8232 8233 # Prefix 8234 prefix = self.get_explode_infos_prefix() 8235 8236 # Field 8237 barcode_infos = prefix + tag 8238 8239 # Variants table 8240 table_variants = self.get_table_variants() 8241 8242 # Header 8243 vcf_reader = self.get_header() 8244 8245 # Create variant id 8246 variant_id_column = self.get_variant_id_column() 8247 added_columns = [variant_id_column] 8248 8249 # variant_id, FORMAT and samples 8250 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8251 self.get_header_sample_list() 8252 ) 8253 8254 # Create dataframe 8255 dataframe_barcode = self.get_query_to_df( 8256 f""" SELECT {samples_fields} FROM {table_variants} """ 8257 ) 8258 8259 # Create barcode column 8260 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8261 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8262 ) 8263 8264 # Add barcode to header 8265 vcf_reader.infos[tag] = vcf.parser._Info( 8266 tag, 8267 ".", 8268 "String", 8269 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8270 "howard calculation", 8271 "0", 8272 self.code_type_map.get("String"), 8273 ) 8274 8275 # Update 8276 sql_update = f""" 8277 UPDATE {table_variants} 8278 SET "INFO" = 8279 concat( 8280 CASE 8281 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8282 THEN '' 8283 ELSE concat("INFO", ';') 8284 END, 8285 CASE 8286 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8287 AND dataframe_barcode."{barcode_infos}" NOT NULL 8288 THEN concat( 8289 '{tag}=', 8290 dataframe_barcode."{barcode_infos}" 8291 ) 8292 ELSE '' 8293 END 8294 ) 8295 FROM dataframe_barcode 8296 WHERE {table_variants}."{variant_id_column}" = 
dataframe_barcode."{variant_id_column}" 8297 """ 8298 self.conn.execute(sql_update) 8299 8300 # Remove added columns 8301 for added_column in added_columns: 8302 self.drop_column(column=added_column) 8303 8304 # Delete dataframe 8305 del dataframe_barcode 8306 gc.collect() 8307 8308 def calculation_barcode_family(self, tag: str = "BCF") -> None: 8309 """ 8310 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 8311 and updates the INFO field in the file with the calculated barcode values. 8312 8313 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 8314 the barcode tag that will be added to the VCF file during the calculation process. If no value 8315 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 8316 :type tag: str (optional) 8317 """ 8318 8319 # if FORMAT and samples 8320 if ( 8321 "FORMAT" in self.get_header_columns_as_list() 8322 and self.get_header_sample_list() 8323 ): 8324 8325 # barcode annotation field 8326 if not tag: 8327 tag = "BCF" 8328 8329 # VCF infos tags 8330 vcf_infos_tags = { 8331 tag: "barcode family calculation", 8332 f"{tag}S": "barcode family samples", 8333 } 8334 8335 # Param 8336 param = self.get_param() 8337 log.debug(f"param={param}") 8338 8339 # Prefix 8340 prefix = self.get_explode_infos_prefix() 8341 8342 # PED param 8343 ped = ( 8344 param.get("calculation", {}) 8345 .get("calculations", {}) 8346 .get("BARCODEFAMILY", {}) 8347 .get("family_pedigree", None) 8348 ) 8349 log.debug(f"ped={ped}") 8350 8351 # Load PED 8352 if ped: 8353 8354 # Pedigree is a file 8355 if isinstance(ped, str) and os.path.exists(full_path(ped)): 8356 log.debug("Pedigree is file") 8357 with open(full_path(ped)) as ped: 8358 ped = json.load(ped) 8359 8360 # Pedigree is a string 8361 elif isinstance(ped, str): 8362 log.debug("Pedigree is str") 8363 try: 8364 ped = json.loads(ped) 8365 log.debug("Pedigree is json str") 8366 except 
ValueError as e: 8367 ped_samples = ped.split(",") 8368 ped = {} 8369 for ped_sample in ped_samples: 8370 ped[ped_sample] = ped_sample 8371 8372 # Pedigree is a dict 8373 elif isinstance(ped, dict): 8374 log.debug("Pedigree is dict") 8375 8376 # Pedigree is not well formatted 8377 else: 8378 msg_error = "Pedigree not well formatted" 8379 log.error(msg_error) 8380 raise ValueError(msg_error) 8381 8382 # Construct list 8383 ped_samples = list(ped.values()) 8384 8385 else: 8386 log.debug("Pedigree not defined. Take all samples") 8387 ped_samples = self.get_header_sample_list() 8388 ped = {} 8389 for ped_sample in ped_samples: 8390 ped[ped_sample] = ped_sample 8391 8392 # Check pedigree 8393 if not ped or len(ped) == 0: 8394 msg_error = f"Error in pedigree: samples {ped_samples}" 8395 log.error(msg_error) 8396 raise ValueError(msg_error) 8397 8398 # Log 8399 log.info( 8400 "Calculation 'BARCODEFAMILY' - Samples: " 8401 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 8402 ) 8403 log.debug(f"ped_samples={ped_samples}") 8404 8405 # Field 8406 barcode_infos = prefix + tag 8407 8408 # Variants table 8409 table_variants = self.get_table_variants() 8410 8411 # Header 8412 vcf_reader = self.get_header() 8413 8414 # Create variant id 8415 variant_id_column = self.get_variant_id_column() 8416 added_columns = [variant_id_column] 8417 8418 # variant_id, FORMAT and samples 8419 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8420 ped_samples 8421 ) 8422 8423 # Create dataframe 8424 dataframe_barcode = self.get_query_to_df( 8425 f""" SELECT {samples_fields} FROM {table_variants} """ 8426 ) 8427 8428 # Create barcode column 8429 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8430 lambda row: barcode(row, samples=ped_samples), axis=1 8431 ) 8432 8433 # Add barcode family to header 8434 # Add vaf_normalization to header 8435 vcf_reader.formats[tag] = vcf.parser._Format( 8436 id=tag, 8437 num=".", 8438 type="String", 8439 
desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8440 type_code=self.code_type_map.get("String"), 8441 ) 8442 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8443 id=f"{tag}S", 8444 num=".", 8445 type="String", 8446 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8447 type_code=self.code_type_map.get("String"), 8448 ) 8449 8450 # Update 8451 # for sample in ped_samples: 8452 sql_update_set = [] 8453 for sample in self.get_header_sample_list() + ["FORMAT"]: 8454 if sample in ped_samples: 8455 value = f'dataframe_barcode."{barcode_infos}"' 8456 value_samples = "'" + ",".join(ped_samples) + "'" 8457 elif sample == "FORMAT": 8458 value = f"'{tag}'" 8459 value_samples = f"'{tag}S'" 8460 else: 8461 value = "'.'" 8462 value_samples = "'.'" 8463 format_regex = r"[a-zA-Z0-9\s]" 8464 sql_update_set.append( 8465 f""" 8466 "{sample}" = 8467 concat( 8468 CASE 8469 WHEN {table_variants}."{sample}" = './.' 8470 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8471 ELSE {table_variants}."{sample}" 8472 END, 8473 ':', 8474 {value}, 8475 ':', 8476 {value_samples} 8477 ) 8478 """ 8479 ) 8480 8481 sql_update_set_join = ", ".join(sql_update_set) 8482 sql_update = f""" 8483 UPDATE {table_variants} 8484 SET {sql_update_set_join} 8485 FROM dataframe_barcode 8486 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8487 """ 8488 self.conn.execute(sql_update) 8489 8490 # Remove added columns 8491 for added_column in added_columns: 8492 self.drop_column(column=added_column) 8493 8494 # Delete dataframe 8495 del dataframe_barcode 8496 gc.collect() 8497 8498 def calculation_trio(self) -> None: 8499 """ 8500 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8501 information to the INFO field of each variant. 
8502 """ 8503 8504 # if FORMAT and samples 8505 if ( 8506 "FORMAT" in self.get_header_columns_as_list() 8507 and self.get_header_sample_list() 8508 ): 8509 8510 # trio annotation field 8511 trio_tag = "trio" 8512 8513 # VCF infos tags 8514 vcf_infos_tags = { 8515 "trio": "trio calculation", 8516 } 8517 8518 # Param 8519 param = self.get_param() 8520 8521 # Prefix 8522 prefix = self.get_explode_infos_prefix() 8523 8524 # Trio param 8525 trio_ped = ( 8526 param.get("calculation", {}) 8527 .get("calculations", {}) 8528 .get("TRIO", {}) 8529 .get("trio_pedigree", None) 8530 ) 8531 8532 # Load trio 8533 if trio_ped: 8534 8535 # Trio pedigree is a file 8536 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8537 log.debug("TRIO pedigree is file") 8538 with open(full_path(trio_ped)) as trio_ped: 8539 trio_ped = json.load(trio_ped) 8540 8541 # Trio pedigree is a string 8542 elif isinstance(trio_ped, str): 8543 log.debug("TRIO pedigree is str") 8544 try: 8545 trio_ped = json.loads(trio_ped) 8546 log.debug("TRIO pedigree is json str") 8547 except ValueError as e: 8548 trio_samples = trio_ped.split(",") 8549 if len(trio_samples) == 3: 8550 trio_ped = { 8551 "father": trio_samples[0], 8552 "mother": trio_samples[1], 8553 "child": trio_samples[2], 8554 } 8555 log.debug("TRIO pedigree is list str") 8556 else: 8557 msg_error = "TRIO pedigree not well formatted" 8558 log.error(msg_error) 8559 raise ValueError(msg_error) 8560 8561 # Trio pedigree is a dict 8562 elif isinstance(trio_ped, dict): 8563 log.debug("TRIO pedigree is dict") 8564 8565 # Trio pedigree is not well formatted 8566 else: 8567 msg_error = "TRIO pedigree not well formatted" 8568 log.error(msg_error) 8569 raise ValueError(msg_error) 8570 8571 # Construct trio list 8572 trio_samples = [ 8573 trio_ped.get("father", ""), 8574 trio_ped.get("mother", ""), 8575 trio_ped.get("child", ""), 8576 ] 8577 8578 else: 8579 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 8580 samples_list = self.get_header_sample_list() 8581 if len(samples_list) >= 3: 8582 trio_samples = self.get_header_sample_list()[0:3] 8583 trio_ped = { 8584 "father": trio_samples[0], 8585 "mother": trio_samples[1], 8586 "child": trio_samples[2], 8587 } 8588 else: 8589 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8590 log.error(msg_error) 8591 raise ValueError(msg_error) 8592 8593 # Check trio pedigree 8594 if not trio_ped or len(trio_ped) != 3: 8595 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8596 log.error(msg_error) 8597 raise ValueError(msg_error) 8598 8599 # Log 8600 log.info( 8601 f"Calculation 'TRIO' - Samples: " 8602 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 8603 ) 8604 8605 # Field 8606 trio_infos = prefix + trio_tag 8607 8608 # Variants table 8609 table_variants = self.get_table_variants() 8610 8611 # Header 8612 vcf_reader = self.get_header() 8613 8614 # Create variant id 8615 variant_id_column = self.get_variant_id_column() 8616 added_columns = [variant_id_column] 8617 8618 # variant_id, FORMAT and samples 8619 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8620 self.get_header_sample_list() 8621 ) 8622 8623 # Create dataframe 8624 dataframe_trio = self.get_query_to_df( 8625 f""" SELECT {samples_fields} FROM {table_variants} """ 8626 ) 8627 8628 # Create trio column 8629 dataframe_trio[trio_infos] = dataframe_trio.apply( 8630 lambda row: trio(row, samples=trio_samples), axis=1 8631 ) 8632 8633 # Add trio to header 8634 vcf_reader.infos[trio_tag] = vcf.parser._Info( 8635 trio_tag, 8636 ".", 8637 "String", 8638 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 8639 "howard calculation", 8640 "0", 8641 self.code_type_map.get("String"), 8642 ) 8643 8644 # Update 8645 sql_update = f""" 8646 UPDATE {table_variants} 8647 SET "INFO" = 8648 concat( 8649 CASE 8650 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8651 THEN '' 8652 ELSE 
concat("INFO", ';') 8653 END, 8654 CASE 8655 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 8656 AND dataframe_trio."{trio_infos}" NOT NULL 8657 THEN concat( 8658 '{trio_tag}=', 8659 dataframe_trio."{trio_infos}" 8660 ) 8661 ELSE '' 8662 END 8663 ) 8664 FROM dataframe_trio 8665 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 8666 """ 8667 self.conn.execute(sql_update) 8668 8669 # Remove added columns 8670 for added_column in added_columns: 8671 self.drop_column(column=added_column) 8672 8673 # Delete dataframe 8674 del dataframe_trio 8675 gc.collect() 8676 8677 def calculation_vaf_normalization(self) -> None: 8678 """ 8679 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8680 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8681 :return: The function does not return anything. 8682 """ 8683 8684 # if FORMAT and samples 8685 if ( 8686 "FORMAT" in self.get_header_columns_as_list() 8687 and self.get_header_sample_list() 8688 ): 8689 8690 # vaf_normalization annotation field 8691 vaf_normalization_tag = "VAF" 8692 8693 # VCF infos tags 8694 vcf_infos_tags = { 8695 "VAF": "VAF Variant Frequency", 8696 } 8697 8698 # Prefix 8699 prefix = self.get_explode_infos_prefix() 8700 8701 # Variants table 8702 table_variants = self.get_table_variants() 8703 8704 # Header 8705 vcf_reader = self.get_header() 8706 8707 # Do not calculate if VAF already exists 8708 if "VAF" in vcf_reader.formats: 8709 log.debug("VAF already on genotypes") 8710 return 8711 8712 # Create variant id 8713 variant_id_column = self.get_variant_id_column() 8714 added_columns = [variant_id_column] 8715 8716 # variant_id, FORMAT and samples 8717 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8718 self.get_header_sample_list() 8719 ) 8720 8721 # Create dataframe 8722 dataframe_vaf_normalization = self.get_query_to_df( 8723 f""" SELECT {variant_id_column}, FORMAT, 
{samples_fields} FROM {table_variants} """ 8724 ) 8725 8726 vaf_normalization_set = [] 8727 8728 # for each sample vaf_normalization 8729 for sample in self.get_header_sample_list(): 8730 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 8731 lambda row: vaf_normalization(row, sample=sample), axis=1 8732 ) 8733 vaf_normalization_set.append( 8734 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 8735 ) 8736 8737 # Add VAF to FORMAT 8738 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 8739 "FORMAT" 8740 ].apply(lambda x: str(x) + ":VAF") 8741 vaf_normalization_set.append( 8742 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 8743 ) 8744 8745 # Add vaf_normalization to header 8746 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 8747 id=vaf_normalization_tag, 8748 num="1", 8749 type="Float", 8750 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 8751 type_code=self.code_type_map.get("Float"), 8752 ) 8753 8754 # Create fields to add in INFO 8755 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 8756 8757 # Update 8758 sql_update = f""" 8759 UPDATE {table_variants} 8760 SET {sql_vaf_normalization_set} 8761 FROM dataframe_vaf_normalization 8762 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 8763 8764 """ 8765 self.conn.execute(sql_update) 8766 8767 # Remove added columns 8768 for added_column in added_columns: 8769 self.drop_column(column=added_column) 8770 8771 # Delete dataframe 8772 del dataframe_vaf_normalization 8773 gc.collect() 8774 8775 def calculation_genotype_stats(self, info: str = "VAF") -> None: 8776 """ 8777 The `calculation_genotype_stats` function calculates genotype statistics for a given information 8778 field in a VCF file and updates the INFO column of the variants table with the calculated 8779 statistics. 
8780 8781 :param info: The `info` parameter is a string that represents the type of information for which 8782 genotype statistics are calculated. It is used to generate various VCF info tags for the 8783 statistics, such as the number of occurrences, the list of values, the minimum value, the 8784 maximum value, the mean, the median, defaults to VAF 8785 :type info: str (optional) 8786 """ 8787 8788 # if FORMAT and samples 8789 if ( 8790 "FORMAT" in self.get_header_columns_as_list() 8791 and self.get_header_sample_list() 8792 ): 8793 8794 # vaf_stats annotation field 8795 vaf_stats_tag = info + "_stats" 8796 8797 # VCF infos tags 8798 vcf_infos_tags = { 8799 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 8800 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 8801 info + "_stats_min": f"genotype {info} Statistics - min {info}", 8802 info + "_stats_max": f"genotype {info} Statistics - max {info}", 8803 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 8804 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 8805 info 8806 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 8807 } 8808 8809 # Prefix 8810 prefix = self.get_explode_infos_prefix() 8811 8812 # Field 8813 vaf_stats_infos = prefix + vaf_stats_tag 8814 8815 # Variants table 8816 table_variants = self.get_table_variants() 8817 8818 # Header 8819 vcf_reader = self.get_header() 8820 8821 # Create variant id 8822 variant_id_column = self.get_variant_id_column() 8823 added_columns = [variant_id_column] 8824 8825 # variant_id, FORMAT and samples 8826 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8827 self.get_header_sample_list() 8828 ) 8829 8830 # Create dataframe 8831 dataframe_vaf_stats = self.get_query_to_df( 8832 f""" SELECT {samples_fields} FROM {table_variants} """ 8833 ) 8834 8835 # Create vaf_stats column 8836 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 8837 
lambda row: genotype_stats( 8838 row, samples=self.get_header_sample_list(), info=info 8839 ), 8840 axis=1, 8841 ) 8842 8843 # List of vcf tags 8844 sql_vaf_stats_fields = [] 8845 8846 # Check all VAF stats infos 8847 for stat in vcf_infos_tags: 8848 8849 # Extract stats 8850 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 8851 lambda x: dict(x).get(stat, "") 8852 ) 8853 8854 # Add snpeff_hgvs to header 8855 vcf_reader.infos[stat] = vcf.parser._Info( 8856 stat, 8857 ".", 8858 "String", 8859 vcf_infos_tags.get(stat, "genotype statistics"), 8860 "howard calculation", 8861 "0", 8862 self.code_type_map.get("String"), 8863 ) 8864 8865 if len(sql_vaf_stats_fields): 8866 sep = ";" 8867 else: 8868 sep = "" 8869 8870 # Create fields to add in INFO 8871 sql_vaf_stats_fields.append( 8872 f""" 8873 CASE 8874 WHEN dataframe_vaf_stats."{stat}" NOT NULL 8875 THEN concat( 8876 '{sep}{stat}=', 8877 dataframe_vaf_stats."{stat}" 8878 ) 8879 ELSE '' 8880 END 8881 """ 8882 ) 8883 8884 # SQL set for update 8885 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 8886 8887 # Update 8888 sql_update = f""" 8889 UPDATE variants 8890 SET "INFO" = 8891 concat( 8892 CASE 8893 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8894 THEN '' 8895 ELSE concat("INFO", ';') 8896 END, 8897 {sql_vaf_stats_fields_set} 8898 ) 8899 FROM dataframe_vaf_stats 8900 WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 8901 8902 """ 8903 self.conn.execute(sql_update) 8904 8905 # Remove added columns 8906 for added_column in added_columns: 8907 self.drop_column(column=added_column) 8908 8909 # Delete dataframe 8910 del dataframe_vaf_stats 8911 gc.collect()
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data() 80 81 def set_input(self, input: str = None) -> None: 82 """ 83 The function takes a file name as input, splits the file name into a name and an extension, and 84 then sets the input_name, input_extension, and input_format attributes of the class 85 86 :param input: The input file 87 """ 88 89 if input and not isinstance(input, str): 90 try: 91 self.input = input.name 92 except: 93 log.error(f"Input file '{input} in bad format") 94 raise ValueError(f"Input file '{input} in bad format") 95 else: 96 self.input = input 97 98 # Input format 99 if input: 100 input_name, input_extension = os.path.splitext(self.input) 101 self.input_name = input_name 102 self.input_extension = input_extension 103 self.input_format = self.input_extension.replace(".", "") 104 105 def set_config(self, config: dict) -> None: 106 """ 107 This function takes in a config object and sets it as the config object for the class 108 109 :param config: The configuration object 110 """ 111 self.config = config 112 113 def set_param(self, param: dict) -> None: 114 """ 115 This function takes in a param 
object and sets it as the param object for the class 116 117 :param param: The paramters object 118 """ 119 self.param = param 120 121 def init_variables(self) -> None: 122 """ 123 This function initializes the variables that will be used in the rest of the class 124 """ 125 self.prefix = "howard" 126 self.table_variants = "variants" 127 self.dataframe = None 128 129 self.comparison_map = { 130 "gt": ">", 131 "gte": ">=", 132 "lt": "<", 133 "lte": "<=", 134 "equals": "=", 135 "contains": "SIMILAR TO", 136 } 137 138 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 139 140 self.code_type_map_to_sql = { 141 "Integer": "INTEGER", 142 "String": "VARCHAR", 143 "Float": "FLOAT", 144 "Flag": "VARCHAR", 145 } 146 147 self.index_additionnal_fields = [] 148 149 def get_indexing(self) -> bool: 150 """ 151 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 152 returns False. 153 :return: The value of the indexing parameter. 154 """ 155 return self.get_param().get("indexing", False) 156 157 def get_connexion_config(self) -> dict: 158 """ 159 The function `get_connexion_config` returns a dictionary containing the configuration for a 160 connection, including the number of threads and memory limit. 161 :return: a dictionary containing the configuration for the Connexion library. 
162 """ 163 164 # config 165 config = self.get_config() 166 167 # Connexion config 168 connexion_config = {} 169 threads = self.get_threads() 170 171 # Threads 172 if threads: 173 connexion_config["threads"] = threads 174 175 # Memory 176 # if config.get("memory", None): 177 # connexion_config["memory_limit"] = config.get("memory") 178 if self.get_memory(): 179 connexion_config["memory_limit"] = self.get_memory() 180 181 # Temporary directory 182 if config.get("tmp", None): 183 connexion_config["temp_directory"] = config.get("tmp") 184 185 # Access 186 if config.get("access", None): 187 access = config.get("access") 188 if access in ["RO"]: 189 access = "READ_ONLY" 190 elif access in ["RW"]: 191 access = "READ_WRITE" 192 connexion_db = self.get_connexion_db() 193 if connexion_db in ":memory:": 194 access = "READ_WRITE" 195 connexion_config["access_mode"] = access 196 197 return connexion_config 198 199 def get_duckdb_settings(self) -> dict: 200 """ 201 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 202 string. 203 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 204 """ 205 206 # config 207 config = self.get_config() 208 209 # duckdb settings 210 duckdb_settings_dict = {} 211 if config.get("duckdb_settings", None): 212 duckdb_settings = config.get("duckdb_settings") 213 duckdb_settings = full_path(duckdb_settings) 214 # duckdb setting is a file 215 if os.path.exists(duckdb_settings): 216 with open(duckdb_settings) as json_file: 217 duckdb_settings_dict = yaml.safe_load(json_file) 218 # duckdb settings is a string 219 else: 220 duckdb_settings_dict = json.loads(duckdb_settings) 221 222 return duckdb_settings_dict 223 224 def set_connexion_db(self) -> str: 225 """ 226 The function `set_connexion_db` returns the appropriate database connection string based on the 227 input format and connection type. 228 :return: the value of the variable `connexion_db`. 
229 """ 230 231 # Default connexion db 232 default_connexion_db = ":memory:" 233 234 # Find connexion db 235 if self.get_input_format() in ["db", "duckdb"]: 236 connexion_db = self.get_input() 237 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 238 connexion_db = default_connexion_db 239 elif self.get_connexion_type() in ["tmpfile"]: 240 tmp_name = tempfile.mkdtemp( 241 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 242 ) 243 connexion_db = f"{tmp_name}/tmp.db" 244 elif self.get_connexion_type() != "": 245 connexion_db = self.get_connexion_type() 246 else: 247 connexion_db = default_connexion_db 248 249 # Set connexion db 250 self.connexion_db = connexion_db 251 252 return connexion_db 253 254 def set_connexion(self, conn) -> None: 255 """ 256 It creates a connection to the database 257 258 :param conn: The connection to the database. If not provided, a new connection to an in-memory 259 database is created 260 """ 261 262 # Connexion db 263 connexion_db = self.set_connexion_db() 264 265 # Connexion config 266 connexion_config = self.get_connexion_config() 267 268 # Connexion format 269 connexion_format = self.get_config().get("connexion_format", "duckdb") 270 # Set connexion format 271 self.connexion_format = connexion_format 272 273 # Connexion 274 if not conn: 275 if connexion_format in ["duckdb"]: 276 conn = duckdb.connect(connexion_db, config=connexion_config) 277 # duckDB settings 278 duckdb_settings = self.get_duckdb_settings() 279 if duckdb_settings: 280 for setting in duckdb_settings: 281 setting_value = duckdb_settings.get(setting) 282 if isinstance(setting_value, str): 283 setting_value = f"'{setting_value}'" 284 conn.execute(f"PRAGMA {setting}={setting_value};") 285 elif connexion_format in ["sqlite"]: 286 conn = sqlite3.connect(connexion_db) 287 288 # Set connexion 289 self.conn = conn 290 291 # Log 292 log.debug(f"connexion_format: {connexion_format}") 293 log.debug(f"connexion_db: {connexion_db}") 294 
log.debug(f"connexion config: {connexion_config}") 295 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 296 297 def set_output(self, output: str = None) -> None: 298 """ 299 If the config file has an output key, set the output to the value of that key. Otherwise, set 300 the output to the input 301 302 :param output: The name of the output file 303 """ 304 305 if output and not isinstance(output, str): 306 self.output = output.name 307 else: 308 self.output = output 309 310 # Output format 311 if self.output: 312 output_name, output_extension = os.path.splitext(self.output) 313 self.output_name = output_name 314 self.output_extension = output_extension 315 self.output_format = self.output_extension.replace(".", "") 316 else: 317 self.output_name = None 318 self.output_extension = None 319 self.output_format = None 320 321 def set_header(self) -> None: 322 """ 323 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 324 """ 325 326 input_file = self.get_input() 327 default_header_list = [ 328 "##fileformat=VCFv4.2", 329 "#CHROM POS ID REF ALT QUAL FILTER INFO", 330 ] 331 332 # Full path 333 input_file = full_path(input_file) 334 335 if input_file: 336 337 input_format = self.get_input_format() 338 input_compressed = self.get_input_compressed() 339 config = self.get_config() 340 header_list = default_header_list 341 if input_format in [ 342 "vcf", 343 "hdr", 344 "tsv", 345 "csv", 346 "psv", 347 "parquet", 348 "db", 349 "duckdb", 350 ]: 351 # header provided in param 352 if config.get("header_file", None): 353 with open(config.get("header_file"), "rt") as f: 354 header_list = self.read_vcf_header(f) 355 # within a vcf file format (header within input file itsself) 356 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 357 # within a compressed vcf file format (.vcf.gz) 358 if input_compressed: 359 with bgzf.open(input_file, "rt") as f: 360 header_list = self.read_vcf_header(f) 361 # within an 
uncompressed vcf file format (.vcf) 362 else: 363 with open(input_file, "rt") as f: 364 header_list = self.read_vcf_header(f) 365 # header provided in default external file .hdr 366 elif os.path.exists((input_file + ".hdr")): 367 with open(input_file + ".hdr", "rt") as f: 368 header_list = self.read_vcf_header(f) 369 else: 370 try: # Try to get header info fields and file columns 371 372 with tempfile.TemporaryDirectory() as tmpdir: 373 374 # Create database 375 db_for_header = Database(database=input_file) 376 377 # Get header columns for infos fields 378 db_header_from_columns = ( 379 db_for_header.get_header_from_columns() 380 ) 381 382 # Get real columns in the file 383 db_header_columns = db_for_header.get_columns() 384 385 # Write header file 386 header_file_tmp = os.path.join(tmpdir, "header") 387 f = open(header_file_tmp, "w") 388 vcf.Writer(f, db_header_from_columns) 389 f.close() 390 391 # Replace #CHROM line with rel columns 392 header_list = db_for_header.read_header_file( 393 header_file=header_file_tmp 394 ) 395 header_list[-1] = "\t".join(db_header_columns) 396 397 except: 398 399 log.warning( 400 f"No header for file {input_file}. Set as default VCF header" 401 ) 402 header_list = default_header_list 403 404 else: # try for unknown format ? 
405 406 log.error(f"Input file format '{input_format}' not available") 407 raise ValueError(f"Input file format '{input_format}' not available") 408 409 if not header_list: 410 header_list = default_header_list 411 412 # header as list 413 self.header_list = header_list 414 415 # header as VCF object 416 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 417 418 else: 419 420 self.header_list = None 421 self.header_vcf = None 422 423 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 424 """ 425 > The function `get_query_to_df` takes a query as a string and returns a pandas dataframe 426 427 :param query: str = "" 428 :type query: str 429 :return: A dataframe 430 """ 431 432 # Connexion format 433 connexion_format = self.get_connexion_format() 434 435 # Limit in query 436 if limit: 437 pd.set_option("display.max_rows", limit) 438 if connexion_format in ["duckdb"]: 439 df = ( 440 self.conn.execute(query) 441 .fetch_record_batch(limit) 442 .read_next_batch() 443 .to_pandas() 444 ) 445 elif connexion_format in ["sqlite"]: 446 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 447 448 # Full query 449 else: 450 if connexion_format in ["duckdb"]: 451 df = self.conn.execute(query).df() 452 elif connexion_format in ["sqlite"]: 453 df = pd.read_sql_query(query, self.conn) 454 455 return df 456 457 def get_overview(self) -> None: 458 """ 459 The function prints the input, output, config, and dataframe of the current object 460 """ 461 table_variants_from = self.get_table_variants(clause="from") 462 sql_columns = self.get_header_columns_as_sql() 463 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 464 df = self.get_query_to_df(sql_query_export) 465 log.info( 466 "Input: " 467 + str(self.get_input()) 468 + " [" 469 + str(str(self.get_input_format())) 470 + "]" 471 ) 472 log.info( 473 "Output: " 474 + str(self.get_output()) 475 + " [" 476 + str(str(self.get_output_format())) 477 + "]" 478 ) 479 
log.info("Config: ") 480 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 481 "\n" 482 ): 483 log.info("\t" + str(d)) 484 log.info("Param: ") 485 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 486 "\n" 487 ): 488 log.info("\t" + str(d)) 489 log.info("Sample list: " + str(self.get_header_sample_list())) 490 log.info("Dataframe: ") 491 for d in str(df).split("\n"): 492 log.info("\t" + str(d)) 493 494 # garbage collector 495 del df 496 gc.collect() 497 498 return None 499 500 def get_stats(self) -> dict: 501 """ 502 The `get_stats` function calculates and returns various statistics of the current object, 503 including information about the input file, variants, samples, header fields, quality, and 504 SNVs/InDels. 505 :return: a dictionary containing various statistics of the current object. The dictionary has 506 the following structure: 507 """ 508 509 # Log 510 log.info(f"Stats Calculation...") 511 512 # table varaints 513 table_variants_from = self.get_table_variants() 514 515 # stats dict 516 stats = {"Infos": {}} 517 518 ### File 519 input_file = self.get_input() 520 stats["Infos"]["Input file"] = input_file 521 522 # Header 523 header_infos = self.get_header().infos 524 header_formats = self.get_header().formats 525 header_infos_list = list(header_infos) 526 header_formats_list = list(header_formats) 527 528 ### Variants 529 530 stats["Variants"] = {} 531 532 # Variants by chr 533 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 534 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 535 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 536 by=["CHROM"], kind="quicksort" 537 ) 538 539 # Total number of variants 540 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 541 542 # Calculate percentage 543 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 544 
lambda x: (x / nb_of_variants) 545 ) 546 547 stats["Variants"]["Number of variants by chromosome"] = ( 548 nb_of_variants_by_chrom.to_dict(orient="index") 549 ) 550 551 stats["Infos"]["Number of variants"] = int(nb_of_variants) 552 553 ### Samples 554 555 # Init 556 samples = {} 557 nb_of_samples = 0 558 559 # Check Samples 560 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 561 log.debug(f"Check samples...") 562 for sample in self.get_header_sample_list(): 563 sql_query_samples = f""" 564 SELECT '{sample}' as sample, 565 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 566 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 567 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 568 FROM {table_variants_from} 569 WHERE ( 570 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 571 AND 572 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 573 ) 574 GROUP BY genotype 575 """ 576 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 577 sample_genotype_count = sql_query_genotype_df["count"].sum() 578 if len(sql_query_genotype_df): 579 nb_of_samples += 1 580 samples[f"{sample} - {sample_genotype_count} variants"] = ( 581 sql_query_genotype_df.to_dict(orient="index") 582 ) 583 584 stats["Samples"] = samples 585 stats["Infos"]["Number of samples"] = nb_of_samples 586 587 # # 588 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 589 # stats["Infos"]["Number of samples"] = nb_of_samples 590 # elif nb_of_samples: 591 # stats["Infos"]["Number of samples"] = "not a VCF format" 592 593 ### INFO and FORMAT fields 594 header_types_df = {} 595 header_types_list = { 596 "List of INFO fields": header_infos, 597 "List of FORMAT fields": header_formats, 598 } 599 i = 0 600 for header_type in header_types_list: 601 602 header_type_infos = header_types_list.get(header_type) 603 header_infos_dict = {} 
604 605 for info in header_type_infos: 606 607 i += 1 608 header_infos_dict[i] = {} 609 610 # ID 611 header_infos_dict[i]["id"] = info 612 613 # num 614 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 615 if header_type_infos[info].num in genotype_map.keys(): 616 header_infos_dict[i]["Number"] = genotype_map.get( 617 header_type_infos[info].num 618 ) 619 else: 620 header_infos_dict[i]["Number"] = header_type_infos[info].num 621 622 # type 623 if header_type_infos[info].type: 624 header_infos_dict[i]["Type"] = header_type_infos[info].type 625 else: 626 header_infos_dict[i]["Type"] = "." 627 628 # desc 629 if header_type_infos[info].desc != None: 630 header_infos_dict[i]["Description"] = header_type_infos[info].desc 631 else: 632 header_infos_dict[i]["Description"] = "" 633 634 if len(header_infos_dict): 635 header_types_df[header_type] = pd.DataFrame.from_dict( 636 header_infos_dict, orient="index" 637 ).to_dict(orient="index") 638 639 # Stats 640 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 641 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 642 stats["Header"] = header_types_df 643 644 ### QUAL 645 if "QUAL" in self.get_header_columns(): 646 sql_query_qual = f""" 647 SELECT 648 avg(CAST(QUAL AS INTEGER)) AS Average, 649 min(CAST(QUAL AS INTEGER)) AS Minimum, 650 max(CAST(QUAL AS INTEGER)) AS Maximum, 651 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 652 median(CAST(QUAL AS INTEGER)) AS Median, 653 variance(CAST(QUAL AS INTEGER)) AS Variance 654 FROM {table_variants_from} 655 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 656 """ 657 658 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 659 stats["Quality"] = {"Stats": qual} 660 661 ### SNV and InDel 662 663 sql_query_snv = f""" 664 665 SELECT Type, count FROM ( 666 667 SELECT 668 'Total' AS Type, 669 count(*) AS count 670 FROM {table_variants_from} 671 672 UNION 673 674 SELECT 675 'MNV' AS Type, 676 count(*) AS count 677 FROM {table_variants_from} 
            WHERE len(REF) > 1 AND len(ALT) > 1
            AND len(REF) = len(ALT)

            UNION

            SELECT
                'InDel' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 OR len(ALT) > 1
            AND len(REF) != len(ALT)

            UNION

            SELECT
                'SNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        # NOTE(review): in the 'InDel' clause above, SQL AND binds tighter than
        # OR, so it parses as "len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))"
        # and multi-nucleotide variants are also counted as InDel — confirm
        # whether this double-counting is intended.
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution spectrum (e.g. "A>G"), most frequent first
        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        Write the statistics returned by `get_stats` to a JSON file.

        :param file: Path of the JSON file to write to
        :type file: str
        :return: The path of the file that was written
        """

        # Get stats
        stats = self.get_stats()

        # Serialize stats as pretty-printed JSON
        json_object = json.dumps(stats, indent=4)

        # Write JSON to the requested file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics (via an intermediate JSON
        file) and print it to stdout.

        :param output_file: Path of the markdown file to write. If not provided,
            a file named "stats.md" is created in a temporary directory
        :type output_file: str
        :param json_file: Path of the JSON file where the statistics are saved.
            If not provided, a file named "stats.json" is created in a temporary
            directory
        :type json_file: str
        :return: None
        """

        # Resolve full paths (may be None)
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default both files into the temporary directory
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders if needed
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read stats back (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index (table of contents), and body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the value as a table; fall back to a
                        # plain "key: value" bullet when it is not table-shaped
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                # Second chance: value may be a JSON string
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print stats to stdout (title and body; the index is file-only)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        Return the input file path.
        :return: The input attribute.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        Return the format of the input file (as detected by `get_file_format`).

        :param input_file: Optional file path; defaults to `self.get_input()`
        :return: The detected file format
        """
        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        Return whether the input file is compressed (as detected by
        `get_file_compressed`).

        :param input_file: Optional file path; defaults to `self.get_input()`
        :return: The compression status of the file
        """
        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        Return the output file path.
        :return: The output attribute.
        """
        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        Return the format of the output file (as detected by `get_file_format`).

        :param output_file: Optional file path; defaults to `self.get_output()`
        :return: The detected file format
        """
        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        Return the configuration dictionary.
        :return: The config attribute.
        """
        return self.config

    def get_param(self) -> dict:
        """
        Return the parameters dictionary.
        :return: The param attribute.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        Return the connexion_db attribute of the object.
        :return: The connexion_db attribute.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        Return the prefix of the object.
        :return: The prefix attribute.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        Return the variants table identifier formatted for use in a SQL clause.

        :param clause: The type of clause the table will be used in: "select",
            "where" or "update" return the bare table name; "from" returns an
            expression aliased as "variants", defaults to "select"
        :return: The table identifier or aliased FROM expression
        """

        # Access mode ("RO" for read-only)
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only parquet input, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        Return the temporary directory path based on configuration parameters,
        defaulting to "/tmp".
        :return: The temporary directory path.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the config, defaulting to "memory".
        :return: The connexion type.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        Return the database connection object.
        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        Close the connection to the database.
        :return: The result of closing the connection.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        Return the header of the VCF.

        :param type: "vcf" for a vcf.Reader header object, "list" for the raw
            header lines, defaults to "vcf"
        :return: The loaded header; when no header is loaded, a minimal required
            header (built from `vcf_required`) is returned instead
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_length(self, file: str = None) -> int:
        """
        Return the length of the header, excluding the #CHROM columns line.

        :param file: Optional path to a VCF header file; when provided, the
            header is read from that file instead of the loaded header
        :type file: str
        :return: The number of header lines minus the #CHROM line, or 0 when no
            header is available
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        Return the #CHROM columns line of the header (the last header line).

        :return: The columns line, or "" when no header is loaded
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        Return the #CHROM header columns as a list of column names.

        :return: The column names, or [] when no header is loaded
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        Return the header columns as a comma-separated, double-quoted SQL
        column list.

        :return: The SQL column list string
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(self) -> list:
        """
        Return the list of sample names from the VCF header.

        :return: The samples attribute of the header
        """
        return self.header_vcf.samples

    def get_verbose(self) -> bool:
        """
        Return the "verbose" flag from the config dictionary, defaulting to
        False when the key doesn't exist.

        :return: The value of the key "verbose" in the config dictionary
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object ("duckdb" or "sqlite").

        :raises ValueError: if the connexion format is not one of the allowed
            values
        :return: The connexion_format attribute
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table.

        :param file: The file (path or file object) to be loaded
        :param columns: A string of the quoted column names separated by commas
        :param header_len: The number of lines to skip at the beginning of the
            file, defaults to 0 (optional)
        :param sep: The separator used in the file, defaults to \t (optional)
        :param chunksize: The number of rows to read in at a time; may be
            overridden by config "load" > "chunk", defaults to 1000000 (optional)
        """

        # Config may override the chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

1109 if chunksize: 1110 for chunk in pd.read_csv( 1111 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1112 ): 1113 if connexion_format in ["duckdb"]: 1114 sql_insert_into = ( 1115 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1116 ) 1117 self.conn.execute(sql_insert_into) 1118 elif connexion_format in ["sqlite"]: 1119 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1120 1121 def load_data( 1122 self, 1123 input_file: str = None, 1124 drop_variants_table: bool = False, 1125 sample_size: int = 20480, 1126 ) -> None: 1127 """ 1128 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1129 table before loading the data and specify a sample size. 1130 1131 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1132 table 1133 :type input_file: str 1134 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1135 determines whether the variants table should be dropped before loading the data. If set to 1136 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1137 not be dropped, defaults to False 1138 :type drop_variants_table: bool (optional) 1139 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1140 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1141 20480 1142 :type sample_size: int (optional) 1143 """ 1144 1145 log.info("Loading...") 1146 1147 # change input file 1148 if input_file: 1149 self.set_input(input_file) 1150 self.set_header() 1151 1152 # drop variants table 1153 if drop_variants_table: 1154 self.drop_variants_table() 1155 1156 # get table variants 1157 table_variants = self.get_table_variants() 1158 1159 # Access 1160 access = self.get_config().get("access", None) 1161 log.debug(f"access: {access}") 1162 1163 # Input format and compress 1164 input_format = self.get_input_format() 1165 input_compressed = self.get_input_compressed() 1166 log.debug(f"input_format: {input_format}") 1167 log.debug(f"input_compressed: {input_compressed}") 1168 1169 # input_compressed_format 1170 if input_compressed: 1171 input_compressed_format = "gzip" 1172 else: 1173 input_compressed_format = "none" 1174 log.debug(f"input_compressed_format: {input_compressed_format}") 1175 1176 # Connexion format 1177 connexion_format = self.get_connexion_format() 1178 1179 # Sample size 1180 if not sample_size: 1181 sample_size = -1 1182 log.debug(f"sample_size: {sample_size}") 1183 1184 # Load data 1185 log.debug(f"Load Data from {input_format}") 1186 1187 # DuckDB connexion 1188 if connexion_format in ["duckdb"]: 1189 1190 # Database already exists 1191 if self.input_format in ["db", "duckdb"]: 1192 1193 if connexion_format in ["duckdb"]: 1194 log.debug(f"Input file format '{self.input_format}' duckDB") 1195 else: 1196 log.error( 1197 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1198 ) 1199 raise ValueError( 1200 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1201 ) 1202 1203 # Load from existing database format 1204 else: 1205 1206 try: 1207 # Create Table or View 1208 database = Database(database=self.input) 1209 sql_from = 
database.get_sql_from(sample_size=sample_size) 1210 1211 if access in ["RO"]: 1212 sql_load = ( 1213 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1214 ) 1215 else: 1216 sql_load = ( 1217 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1218 ) 1219 self.conn.execute(sql_load) 1220 1221 except: 1222 # Format not available 1223 log.error(f"Input file format '{self.input_format}' not available") 1224 raise ValueError( 1225 f"Input file format '{self.input_format}' not available" 1226 ) 1227 1228 # SQLite connexion 1229 elif connexion_format in ["sqlite"] and input_format in [ 1230 "vcf", 1231 "tsv", 1232 "csv", 1233 "psv", 1234 ]: 1235 1236 # Main structure 1237 structure = { 1238 "#CHROM": "VARCHAR", 1239 "POS": "INTEGER", 1240 "ID": "VARCHAR", 1241 "REF": "VARCHAR", 1242 "ALT": "VARCHAR", 1243 "QUAL": "VARCHAR", 1244 "FILTER": "VARCHAR", 1245 "INFO": "VARCHAR", 1246 } 1247 1248 # Strcuture with samples 1249 structure_complete = structure 1250 if self.get_header_sample_list(): 1251 structure["FORMAT"] = "VARCHAR" 1252 for sample in self.get_header_sample_list(): 1253 structure_complete[sample] = "VARCHAR" 1254 1255 # Columns list for create and insert 1256 sql_create_table_columns = [] 1257 sql_create_table_columns_list = [] 1258 for column in structure_complete: 1259 column_type = structure_complete[column] 1260 sql_create_table_columns.append( 1261 f'"{column}" {column_type} default NULL' 1262 ) 1263 sql_create_table_columns_list.append(f'"{column}"') 1264 1265 # Create database 1266 log.debug(f"Create Table {table_variants}") 1267 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1268 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1269 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1270 self.conn.execute(sql_create_table) 1271 1272 # chunksize define length of file chunk load file 1273 chunksize = 100000 1274 1275 # delimiter 1276 delimiter 
= file_format_delimiters.get(input_format, "\t") 1277 1278 # Load the input file 1279 with open(self.input, "rt") as input_file: 1280 1281 # Use the appropriate file handler based on the input format 1282 if input_compressed: 1283 input_file = bgzf.open(self.input, "rt") 1284 if input_format in ["vcf"]: 1285 header_len = self.get_header_length() 1286 else: 1287 header_len = 0 1288 1289 # Insert the file contents into a table 1290 self.insert_file_to_table( 1291 input_file, 1292 columns=sql_create_table_columns_list_sql, 1293 header_len=header_len, 1294 sep=delimiter, 1295 chunksize=chunksize, 1296 ) 1297 1298 else: 1299 log.error( 1300 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1301 ) 1302 raise ValueError( 1303 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1304 ) 1305 1306 # Explode INFOS fields into table fields 1307 if self.get_explode_infos(): 1308 self.explode_infos( 1309 prefix=self.get_explode_infos_prefix(), 1310 fields=self.get_explode_infos_fields(), 1311 force=True, 1312 ) 1313 1314 # Create index after insertion 1315 self.create_indexes() 1316 1317 def get_explode_infos(self) -> bool: 1318 """ 1319 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1320 to False if it is not set. 1321 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1322 value. If the parameter is not present, it will return False. 1323 """ 1324 1325 return self.get_param().get("explode", {}).get("explode_infos", False) 1326 1327 def get_explode_infos_fields( 1328 self, 1329 explode_infos_fields: str = None, 1330 remove_fields_not_in_header: bool = False, 1331 ) -> list: 1332 """ 1333 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1334 the input parameter `explode_infos_fields`. 

        :param explode_infos_fields: The `explode_infos_fields` parameter is a string (or list) that
        specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can
        be a comma-separated list of field names (or regex patterns) to explode
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The function `get_explode_infos_fields` returns a list of exploded information
        fields, resolved from the parameter (or from param "explode" > "explode_infos_fields"),
        with patterns expanded against the header INFO fields
        """

        # If no fields given, get them from param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If still no fields, default to all fields in header using the "*" keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (string is comma-separated)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below
            # — dead code, candidate for removal
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # INFO fields declared in the header (sorted, deduplicated)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # Translate the "*" keyword into a match-all regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # When the pattern expanded to other fields, drop those that
                # were explicitly listed in the input (they are added on their
                # own iteration)
                if fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header, keep it anyway (avoid not well
                # formatted header) unless removal was requested
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        Return the prefix used when exploding INFO fields into columns.

        :param explode_infos_prefix: Optional explicit prefix; when absent, the
            value is read from param ("explode" > "explode_infos_prefix",
            defaulting to "")
        :type explode_infos_prefix: str
        :return: The prefix string
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the table
        :param column_type: The SQL data type of the column to add, such as
        "INTEGER", "VARCHAR", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column.
If a default value is provided, it will be assigned to 1464 the column for any existing rows that do not have a value for that column 1465 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1466 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1467 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1468 to False 1469 :type drop: bool (optional) 1470 :return: a boolean value indicating whether the column was successfully added to the table. 1471 """ 1472 1473 # added 1474 added = False 1475 dropped = False 1476 1477 # Check if the column already exists in the table 1478 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1479 columns = self.get_query_to_df(query).columns.tolist() 1480 if column_name in columns: 1481 log.debug( 1482 f"The {column_name} column already exists in the {table_name} table" 1483 ) 1484 if drop: 1485 self.drop_column(table_name=table_name, column_name=column_name) 1486 dropped = True 1487 else: 1488 return None 1489 else: 1490 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1491 1492 # Add column in table 1493 add_column_query = ( 1494 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1495 ) 1496 if default_value is not None: 1497 add_column_query += f" DEFAULT {default_value}" 1498 self.execute_query(add_column_query) 1499 added = not dropped 1500 log.debug( 1501 f"The {column_name} column was successfully added to the {table_name} table" 1502 ) 1503 1504 if added: 1505 added_column = { 1506 "table_name": table_name, 1507 "column_name": column_name, 1508 "column_type": column_type, 1509 "default_value": default_value, 1510 } 1511 else: 1512 added_column = None 1513 1514 return added_column 1515 1516 def drop_column( 1517 self, column: dict = None, table_name: str = None, column_name: str = None 1518 ) -> bool: 1519 """ 1520 The `drop_column` function drops a 
specified column from a given table in a database and returns 1521 True if the column was successfully dropped, and False if the column does not exist in the 1522 table. 1523 1524 :param column: The `column` parameter is a dictionary that contains information about the column 1525 you want to drop. It has two keys: 1526 :type column: dict 1527 :param table_name: The `table_name` parameter is the name of the table from which you want to 1528 drop a column 1529 :type table_name: str 1530 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1531 from the table 1532 :type column_name: str 1533 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1534 and False if the column does not exist in the table. 1535 """ 1536 1537 # Find column infos 1538 if column: 1539 if isinstance(column, dict): 1540 table_name = column.get("table_name", None) 1541 column_name = column.get("column_name", None) 1542 elif isinstance(column, str): 1543 table_name = self.get_table_variants() 1544 column_name = column 1545 else: 1546 table_name = None 1547 column_name = None 1548 1549 if not table_name and not column_name: 1550 return False 1551 1552 # Removed 1553 removed = False 1554 1555 # Check if the column already exists in the table 1556 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1557 columns = self.get_query_to_df(query).columns.tolist() 1558 if column_name in columns: 1559 log.debug(f"The {column_name} column exists in the {table_name} table") 1560 else: 1561 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1562 return False 1563 1564 # Add column in table # ALTER TABLE integers DROP k 1565 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1566 self.execute_query(add_column_query) 1567 removed = True 1568 log.debug( 1569 f"The {column_name} column was successfully dropped to the {table_name} table" 1570 ) 1571 1572 return removed 1573 1574 def 
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns and return the
        list of added columns. No-op when access mode is "RO".

        :param prefix: Prefix used for the exploded INFO columns; when not a
            string (None/True), falls back to `self.get_explode_infos_prefix()`
            or "INFO/"
        :type prefix: str
        :param create_index: Whether to (re)create indexes after exploding,
            defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields (or patterns) to explode; when not
            provided, all fields are resolved via `get_explode_infos_fields`
        :type fields: list
        :param force: Whether to drop and re-create a column that already
            exists in the table, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: Whether to run one UPDATE for all
            fields together instead of one UPDATE per field, defaults to False.
            (NOTE: "proccess" misspelling kept for API compatibility)
        :type proccess_all_fields_together: bool (optional)
        :return: The list of added columns (dicts from `add_column`)
        """

        # drop indexes before heavy UPDATEs
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" for read-only)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to configured prefix, then "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (best effort — empty when unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus requested fields)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/num from header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Only single-valued fields keep their typed column;
                    # multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field column (dropped first when force)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the SET expression extracting the field value
                        # from the raw INFO column ('' and '.' map to NULL)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATEs smaller;
                # fall back to a single pass when the query fails
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when splitting by chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: all fields in one UPDATE, or one per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion (skipped when
        indexing is disabled or access mode is "RO").
        """

        # Access mode ("RO" for read-only)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index on the variant key and on each locus column
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS
idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1800 self.conn.execute(sql_create_table_index) 1801 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1802 self.conn.execute(sql_create_table_index) 1803 for field in self.index_additionnal_fields: 1804 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1805 self.conn.execute(sql_create_table_index) 1806 1807 def drop_indexes(self) -> None: 1808 """ 1809 Create indexes on the table after insertion 1810 """ 1811 1812 # Access 1813 access = self.get_config().get("access", None) 1814 1815 # get table variants 1816 table_variants = self.get_table_variants("FROM") 1817 1818 # Get database format 1819 connexion_format = self.get_connexion_format() 1820 1821 if access not in ["RO"]: 1822 if connexion_format in ["duckdb"]: 1823 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1824 elif connexion_format in ["sqlite"]: 1825 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1826 1827 list_indexes = self.conn.execute(sql_list_indexes) 1828 index_names = [row[0] for row in list_indexes.fetchall()] 1829 for index in index_names: 1830 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1831 self.conn.execute(sql_drop_table_index) 1832 1833 def read_vcf_header(self, f) -> list: 1834 """ 1835 It reads the header of a VCF file and returns a list of the header lines 1836 1837 :param f: the file object 1838 :return: The header lines of the VCF file. 
1839 """ 1840 1841 header_list = [] 1842 for line in f: 1843 header_list.append(line) 1844 if line.startswith("#CHROM"): 1845 break 1846 return header_list 1847 1848 def read_vcf_header_file(self, file: str = None) -> list: 1849 """ 1850 The function `read_vcf_header_file` reads the header of a VCF file, either from a compressed or 1851 uncompressed file. 1852 1853 :param file: The `file` parameter is a string that represents the path to the VCF header file 1854 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1855 default to `None` 1856 :type file: str 1857 :param compressed: The `compressed` parameter is a boolean flag that indicates whether the VCF 1858 file is compressed or not. If `compressed` is set to `True`, it means that the VCF file is 1859 compressed using the BGZF compression format. If `compressed` is set to `False`, it means that, 1860 defaults to False 1861 :type compressed: bool (optional) 1862 :return: a list. 1863 """ 1864 1865 if self.get_input_compressed(input_file=file): 1866 with bgzf.open(file, "rt") as f: 1867 return self.read_vcf_header(f=f) 1868 else: 1869 with open(file, "rt") as f: 1870 return self.read_vcf_header(f=f) 1871 1872 def execute_query(self, query: str): 1873 """ 1874 It takes a query as an argument, executes it, and returns the results 1875 1876 :param query: The query to be executed 1877 :return: The result of the query is being returned. 
1878 """ 1879 if query: 1880 return self.conn.execute(query) # .fetchall() 1881 else: 1882 return None 1883 1884 def export_output( 1885 self, 1886 output_file: str | None = None, 1887 output_header: str | None = None, 1888 export_header: bool = True, 1889 query: str | None = None, 1890 parquet_partitions: list | None = None, 1891 chunk_size: int | None = None, 1892 threads: int | None = None, 1893 sort: bool = False, 1894 index: bool = False, 1895 order_by: str | None = None, 1896 ) -> bool: 1897 """ 1898 The `export_output` function exports data from a VCF file to a specified output file in various 1899 formats, including VCF, CSV, TSV, PSV, and Parquet. 1900 1901 :param output_file: The `output_file` parameter is a string that specifies the name of the 1902 output file to be generated by the function. This is where the exported data will be saved 1903 :type output_file: str 1904 :param output_header: The `output_header` parameter is a string that specifies the name of the 1905 file where the header of the VCF file will be exported. If this parameter is not provided, the 1906 header will be exported to a file with the same name as the `output_file` parameter, but with 1907 the extension " 1908 :type output_header: str 1909 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1910 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1911 True, the header will be exported to a file. If `export_header` is False, the header will not 1912 be, defaults to True, if output format is not VCF 1913 :type export_header: bool (optional) 1914 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1915 select specific data from the VCF file before exporting it. 
If provided, only the data that 1916 matches the query will be exported 1917 :type query: str 1918 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1919 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1920 organize data in a hierarchical directory structure based on the values of one or more columns. 1921 This can improve query performance when working with large datasets 1922 :type parquet_partitions: list 1923 :param chunk_size: The `chunk_size` parameter specifies the number of 1924 records in batch when exporting data in Parquet format. This parameter is used for 1925 partitioning the Parquet file into multiple files. 1926 :type chunk_size: int 1927 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1928 threads to be used during the export process. It determines the level of parallelism and can 1929 improve the performance of the export operation. If not provided, the function will use the 1930 default number of threads 1931 :type threads: int 1932 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1933 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1934 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1935 False 1936 :type sort: bool (optional) 1937 :param index: The `index` parameter is a boolean flag that determines whether an index should be 1938 created on the output file. If `index` is True, an index will be created. If `index` is False, 1939 no index will be created. The default value is False, defaults to False 1940 :type index: bool (optional) 1941 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 1942 sorting the output file. This parameter is only applicable when exporting data in VCF format 1943 :type order_by: str 1944 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 1945 None if it doesn't. 1946 """ 1947 1948 # Log 1949 log.info("Exporting...") 1950 1951 # Full path 1952 output_file = full_path(output_file) 1953 output_header = full_path(output_header) 1954 1955 # Config 1956 config = self.get_config() 1957 1958 # Param 1959 param = self.get_param() 1960 1961 # Tmp files to remove 1962 tmp_to_remove = [] 1963 1964 # If no output, get it 1965 if not output_file: 1966 output_file = self.get_output() 1967 1968 # If not threads 1969 if not threads: 1970 threads = self.get_threads() 1971 1972 # Auto header name with extension 1973 if export_header or output_header: 1974 if not output_header: 1975 output_header = f"{output_file}.hdr" 1976 # Export header 1977 self.export_header(output_file=output_file) 1978 1979 # Switch off export header if VCF output 1980 output_file_type = get_file_format(output_file) 1981 if output_file_type in ["vcf"]: 1982 export_header = False 1983 tmp_to_remove.append(output_header) 1984 1985 # Chunk size 1986 if not chunk_size: 1987 chunk_size = config.get("chunk_size", None) 1988 1989 # Parquet partition 1990 if not parquet_partitions: 1991 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 1992 if parquet_partitions and isinstance(parquet_partitions, str): 1993 parquet_partitions = parquet_partitions.split(",") 1994 1995 # Order by 1996 if not order_by: 1997 order_by = param.get("export", {}).get("order_by", "") 1998 1999 # Header in output 2000 header_in_output = param.get("export", {}).get("include_header", False) 2001 2002 # Database 2003 database_source = self.get_connexion() 2004 2005 # Connexion format 2006 connexion_format = self.get_connexion_format() 2007 2008 # Explode infos 2009 if self.get_explode_infos(): 2010 self.explode_infos( 2011 prefix=self.get_explode_infos_prefix(), 2012 fields=self.get_explode_infos_fields(), 2013 force=False, 2014 ) 2015 2016 # if connexion_format in ["sqlite"] or query: 
2017 if connexion_format in ["sqlite"]: 2018 2019 # Export in Parquet 2020 random_tmp = "".join( 2021 random.choice(string.ascii_lowercase) for i in range(10) 2022 ) 2023 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2024 tmp_to_remove.append(database_source) 2025 2026 # Table Variants 2027 table_variants = self.get_table_variants() 2028 2029 # Create export query 2030 sql_query_export_subquery = f""" 2031 SELECT * FROM {table_variants} 2032 """ 2033 2034 # Write source file 2035 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2036 2037 # Create database 2038 database = Database( 2039 database=database_source, 2040 table="variants", 2041 header_file=output_header, 2042 conn_config=self.get_connexion_config(), 2043 ) 2044 2045 # Existing colomns header 2046 # existing_columns_header = database.get_header_file_columns(output_header) 2047 existing_columns_header = database.get_header_columns_from_database() 2048 2049 # Export file 2050 database.export( 2051 output_database=output_file, 2052 output_header=output_header, 2053 existing_columns_header=existing_columns_header, 2054 parquet_partitions=parquet_partitions, 2055 chunk_size=chunk_size, 2056 threads=threads, 2057 sort=sort, 2058 index=index, 2059 header_in_output=header_in_output, 2060 order_by=order_by, 2061 query=query, 2062 export_header=export_header, 2063 ) 2064 2065 # Remove 2066 remove_if_exists(tmp_to_remove) 2067 2068 return (os.path.exists(output_file) or None) and ( 2069 os.path.exists(output_file) or None 2070 ) 2071 2072 def get_extra_infos(self, table: str = None) -> list: 2073 """ 2074 > This function returns a list of columns that are in the table but not in the header 2075 2076 The function is called `get_extra_infos` and it takes two arguments: `self` and `table`. The 2077 `self` argument is a reference to the object that called the function. 
The `table` argument is 2078 the name of the table that we want to get the extra columns from 2079 2080 :param table: The table to get the extra columns from. If not specified, it will use the 2081 variants table 2082 :param format: The format of the output. If it's "sql", it will return a string of the extra 2083 columns separated by commas. If it's "list", it will return a list of the extra columns 2084 :return: A list of columns that are in the table but not in the header 2085 """ 2086 2087 header_columns = [] 2088 2089 if not table: 2090 table = self.get_table_variants(clause="from") 2091 header_columns = self.get_header_columns() 2092 2093 # Check all columns in the database 2094 query = f""" SELECT * FROM {table} LIMIT 1 """ 2095 log.debug(f"query {query}") 2096 table_columns = self.get_query_to_df(query).columns.tolist() 2097 extra_columns = [] 2098 2099 # Construct extra infos (not in header) 2100 for column in table_columns: 2101 if column not in header_columns: 2102 extra_columns.append(column) 2103 2104 return extra_columns 2105 2106 def get_extra_infos_sql(self, table: str = None) -> str: 2107 """ 2108 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2109 by double quotes 2110 2111 :param table: The name of the table to get the extra infos from. If None, the default table is 2112 used 2113 :type table: str 2114 :return: A string of the extra infos 2115 """ 2116 2117 return ", ".join( 2118 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2119 ) 2120 2121 def export_header( 2122 self, 2123 header_name: str = None, 2124 output_file: str = None, 2125 output_file_ext: str = ".hdr", 2126 clean_header: bool = True, 2127 remove_chrom_line: bool = False, 2128 ) -> str: 2129 """ 2130 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2131 specified options, and writes it to a new file. 
2132 2133 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2134 this parameter is not specified, the header will be written to the output file 2135 :type header_name: str 2136 :param output_file: The `output_file` parameter in the `export_header` function is used to 2137 specify the name of the output file where the header will be written. If this parameter is not 2138 provided, the header will be written to a temporary file 2139 :type output_file: str 2140 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2141 string that represents the extension of the output header file. By default, it is set to ".hdr" 2142 if not specified by the user. This extension will be appended to the `output_file` name to 2143 create the final, defaults to .hdr 2144 :type output_file_ext: str (optional) 2145 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2146 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2147 `True`, the function will clean the header by modifying certain lines based on a specific 2148 pattern. If `clean_header`, defaults to True 2149 :type clean_header: bool (optional) 2150 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2151 boolean flag that determines whether the #CHROM line should be removed from the header before 2152 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2153 defaults to False 2154 :type remove_chrom_line: bool (optional) 2155 :return: The function `export_header` returns the name of the temporary header file that is 2156 created. 
2157 """ 2158 2159 if not header_name and not output_file: 2160 output_file = self.get_output() 2161 2162 if self.get_header(): 2163 2164 # Get header object 2165 header_obj = self.get_header() 2166 2167 # Create database 2168 db_for_header = Database(database=self.get_input()) 2169 2170 # Get real columns in the file 2171 db_header_columns = db_for_header.get_columns() 2172 2173 with tempfile.TemporaryDirectory() as tmpdir: 2174 2175 # Write header file 2176 header_file_tmp = os.path.join(tmpdir, "header") 2177 f = open(header_file_tmp, "w") 2178 vcf.Writer(f, header_obj) 2179 f.close() 2180 2181 # Replace #CHROM line with rel columns 2182 header_list = db_for_header.read_header_file( 2183 header_file=header_file_tmp 2184 ) 2185 header_list[-1] = "\t".join(db_header_columns) 2186 2187 # Remove CHROM line 2188 if remove_chrom_line: 2189 header_list.pop() 2190 2191 # Clean header 2192 if clean_header: 2193 header_list_clean = [] 2194 for head in header_list: 2195 # Clean head for malformed header 2196 head_clean = head 2197 head_clean = re.subn( 2198 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2199 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2200 head_clean, 2201 2, 2202 )[0] 2203 # Write header 2204 header_list_clean.append(head_clean) 2205 header_list = header_list_clean 2206 2207 tmp_header_name = output_file + output_file_ext 2208 2209 f = open(tmp_header_name, "w") 2210 for line in header_list: 2211 f.write(line) 2212 f.close() 2213 2214 return tmp_header_name 2215 2216 def export_variant_vcf( 2217 self, 2218 vcf_file, 2219 remove_info: bool = False, 2220 add_samples: bool = True, 2221 list_samples: list = [], 2222 index: bool = False, 2223 threads: int | None = None, 2224 ) -> bool | None: 2225 """ 2226 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2227 remove INFO field, add samples, and control compression and indexing. 
2228 2229 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2230 written to. It is the output file that will contain the filtered VCF data based on the specified 2231 parameters 2232 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2233 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2234 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2235 in, defaults to False 2236 :type remove_info: bool (optional) 2237 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2238 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2239 If set to False, the samples will be removed. The default value is True, defaults to True 2240 :type add_samples: bool (optional) 2241 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2242 in the output VCF file. By default, all samples will be included. If you provide a list of 2243 samples, only those samples will be included in the output file 2244 :type list_samples: list 2245 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2246 determines whether or not to create an index for the output VCF file. If `index` is set to 2247 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2248 :type index: bool (optional) 2249 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2250 number of threads to use for exporting the VCF file. It determines how many parallel threads 2251 will be used during the export process. More threads can potentially speed up the export process 2252 by utilizing multiple cores of the processor. 
If 2253 :type threads: int | None 2254 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2255 method with various parameters including the output file, query, threads, sort flag, and index 2256 flag. The `export_output` method is responsible for exporting the VCF data based on the 2257 specified parameters and configurations provided in the `export_variant_vcf` function. 2258 """ 2259 2260 # Config 2261 config = self.get_config() 2262 2263 # Extract VCF 2264 log.debug("Export VCF...") 2265 2266 # Table variants 2267 table_variants = self.get_table_variants() 2268 2269 # Threads 2270 if not threads: 2271 threads = self.get_threads() 2272 2273 # Info fields 2274 if remove_info: 2275 if not isinstance(remove_info, str): 2276 remove_info = "." 2277 info_field = f"""'{remove_info}' as INFO""" 2278 else: 2279 info_field = "INFO" 2280 2281 # Samples fields 2282 if add_samples: 2283 if not list_samples: 2284 list_samples = self.get_header_sample_list() 2285 if list_samples: 2286 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2287 else: 2288 samples_fields = "" 2289 log.debug(f"samples_fields: {samples_fields}") 2290 else: 2291 samples_fields = "" 2292 2293 # Variants 2294 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2295 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} """ 2296 2297 return self.export_output( 2298 output_file=vcf_file, 2299 output_header=None, 2300 export_header=True, 2301 query=sql_query_select, 2302 parquet_partitions=None, 2303 chunk_size=config.get("chunk_size", None), 2304 threads=threads, 2305 sort=True, 2306 index=index, 2307 order_by=None, 2308 ) 2309 2310 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2311 """ 2312 It takes a list of commands and runs them in parallel using the number of threads specified 2313 2314 :param commands: A list of commands to run 2315 :param threads: The number of threads 
to use, defaults to 1 (optional) 2316 """ 2317 2318 run_parallel_commands(commands, threads) 2319 2320 def get_threads(self, default: int = 1) -> int: 2321 """ 2322 This function returns the number of threads to use for a job, with a default value of 1 if not 2323 specified. 2324 2325 :param default: The `default` parameter in the `get_threads` method is used to specify the 2326 default number of threads to use if no specific value is provided. If no value is provided for 2327 the `threads` parameter in the configuration or input parameters, the `default` value will be 2328 used, defaults to 1 2329 :type default: int (optional) 2330 :return: the number of threads to use for the current job. 2331 """ 2332 2333 # Config 2334 config = self.get_config() 2335 2336 # Param 2337 param = self.get_param() 2338 2339 # Input threads 2340 input_thread = param.get("threads", config.get("threads", None)) 2341 2342 # Check threads 2343 if not input_thread: 2344 threads = default 2345 elif int(input_thread) <= 0: 2346 threads = os.cpu_count() 2347 else: 2348 threads = int(input_thread) 2349 return threads 2350 2351 def get_memory(self, default: str = None) -> str: 2352 """ 2353 This function retrieves the memory value from parameters or configuration with a default value 2354 if not found. 2355 2356 :param default: The `get_memory` function takes in a default value as a string parameter. This 2357 default value is used as a fallback in case the `memory` parameter is not provided in the 2358 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2359 the function 2360 :type default: str 2361 :return: The `get_memory` function returns a string value representing the memory parameter. If 2362 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2363 return the default value provided as an argument to the function. 
2364 """ 2365 2366 # Config 2367 config = self.get_config() 2368 2369 # Param 2370 param = self.get_param() 2371 2372 # Input threads 2373 input_memory = param.get("memory", config.get("memory", None)) 2374 2375 # Check threads 2376 if input_memory: 2377 memory = input_memory 2378 else: 2379 memory = default 2380 2381 return memory 2382 2383 def update_from_vcf(self, vcf_file: str) -> None: 2384 """ 2385 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2386 2387 :param vcf_file: the path to the VCF file 2388 """ 2389 2390 connexion_format = self.get_connexion_format() 2391 2392 if connexion_format in ["duckdb"]: 2393 self.update_from_vcf_duckdb(vcf_file) 2394 elif connexion_format in ["sqlite"]: 2395 self.update_from_vcf_sqlite(vcf_file) 2396 2397 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2398 """ 2399 It takes a VCF file and updates the INFO column of the variants table in the database with the 2400 INFO column of the VCF file 2401 2402 :param vcf_file: the path to the VCF file 2403 """ 2404 2405 # varaints table 2406 table_variants = self.get_table_variants() 2407 2408 # Loading VCF into temporaire table 2409 skip = self.get_header_length(file=vcf_file) 2410 vcf_df = pd.read_csv( 2411 vcf_file, 2412 sep="\t", 2413 engine="c", 2414 skiprows=skip, 2415 header=0, 2416 low_memory=False, 2417 ) 2418 sql_query_update = f""" 2419 UPDATE {table_variants} as table_variants 2420 SET INFO = concat( 2421 CASE 2422 WHEN INFO NOT IN ('', '.') 2423 THEN INFO 2424 ELSE '' 2425 END, 2426 ( 2427 SELECT 2428 concat( 2429 CASE 2430 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2431 THEN ';' 2432 ELSE '' 2433 END 2434 , 2435 CASE 2436 WHEN table_parquet.INFO NOT IN ('','.') 2437 THEN table_parquet.INFO 2438 ELSE '' 2439 END 2440 ) 2441 FROM vcf_df as table_parquet 2442 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2443 AND table_parquet.\"POS\" = 
table_variants.\"POS\" 2444 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2445 AND table_parquet.\"REF\" = table_variants.\"REF\" 2446 AND table_parquet.INFO NOT IN ('','.') 2447 ) 2448 ) 2449 ; 2450 """ 2451 self.conn.execute(sql_query_update) 2452 2453 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2454 """ 2455 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2456 table, then updates the INFO column of the variants table with the INFO column of the temporary 2457 table 2458 2459 :param vcf_file: The path to the VCF file you want to update the database with 2460 """ 2461 2462 # Create a temporary table for the VCF 2463 table_vcf = "tmp_vcf" 2464 sql_create = ( 2465 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2466 ) 2467 self.conn.execute(sql_create) 2468 2469 # Loading VCF into temporaire table 2470 vcf_df = pd.read_csv( 2471 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2472 ) 2473 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2474 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2475 2476 # Update table 'variants' with VCF data 2477 # warning: CONCAT as || operator 2478 sql_query_update = f""" 2479 UPDATE variants as table_variants 2480 SET INFO = CASE 2481 WHEN INFO NOT IN ('', '.') 2482 THEN INFO 2483 ELSE '' 2484 END || 2485 ( 2486 SELECT 2487 CASE 2488 WHEN table_variants.INFO NOT IN ('','.') 2489 AND table_vcf.INFO NOT IN ('','.') 2490 THEN ';' 2491 ELSE '' 2492 END || 2493 CASE 2494 WHEN table_vcf.INFO NOT IN ('','.') 2495 THEN table_vcf.INFO 2496 ELSE '' 2497 END 2498 FROM {table_vcf} as table_vcf 2499 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2500 AND table_vcf.\"POS\" = table_variants.\"POS\" 2501 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2502 AND table_vcf.\"REF\" = table_variants.\"REF\" 2503 ) 2504 """ 2505 self.conn.execute(sql_query_update) 2506 2507 # Drop temporary table 
2508 sql_drop = f"DROP TABLE {table_vcf}" 2509 self.conn.execute(sql_drop) 2510 2511 def drop_variants_table(self) -> None: 2512 """ 2513 > This function drops the variants table 2514 """ 2515 2516 table_variants = self.get_table_variants() 2517 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2518 self.conn.execute(sql_table_variants) 2519 2520 def set_variant_id( 2521 self, variant_id_column: str = "variant_id", force: bool = None 2522 ) -> str: 2523 """ 2524 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2525 `#CHROM`, `POS`, `REF`, and `ALT` columns 2526 2527 :param variant_id_column: The name of the column to be created in the variants table, defaults 2528 to variant_id 2529 :type variant_id_column: str (optional) 2530 :param force: If True, the variant_id column will be created even if it already exists 2531 :type force: bool 2532 :return: The name of the column that contains the variant_id 2533 """ 2534 2535 # Assembly 2536 assembly = self.get_param().get( 2537 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2538 ) 2539 2540 # INFO/Tag prefix 2541 prefix = self.get_explode_infos_prefix() 2542 2543 # Explode INFO/SVTYPE 2544 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2545 2546 # variants table 2547 table_variants = self.get_table_variants() 2548 2549 # variant_id column 2550 if not variant_id_column: 2551 variant_id_column = "variant_id" 2552 2553 # Creta variant_id column 2554 if "variant_id" not in self.get_extra_infos() or force: 2555 2556 # Create column 2557 self.add_column( 2558 table_name=table_variants, 2559 column_name=variant_id_column, 2560 column_type="UBIGINT", 2561 default_value="0", 2562 ) 2563 2564 # Update column 2565 self.conn.execute( 2566 f""" 2567 UPDATE {table_variants} 2568 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2569 """ 2570 ) 2571 2572 # Remove added columns 2573 for 
added_column in added_columns: 2574 self.drop_column(column=added_column) 2575 2576 # return variant_id column name 2577 return variant_id_column 2578 2579 def get_variant_id_column( 2580 self, variant_id_column: str = "variant_id", force: bool = None 2581 ) -> str: 2582 """ 2583 This function returns the variant_id column name 2584 2585 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2586 defaults to variant_id 2587 :type variant_id_column: str (optional) 2588 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2589 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2590 if it is not already set, or if it is set 2591 :type force: bool 2592 :return: The variant_id column name. 2593 """ 2594 2595 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2596 2597 ### 2598 # Annotation 2599 ### 2600 2601 def scan_databases( 2602 self, database_formats: list["parquet"], database_releases: list = ["current"] 2603 ) -> dict: 2604 """ 2605 The function `scan_databases` scans for available databases based on specified formats and 2606 releases. 2607 2608 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2609 of the databases to be scanned. In this case, the accepted format is "parquet" 2610 :type database_formats: list ["parquet"] 2611 :param database_releases: The `database_releases` parameter is a list that specifies the 2612 releases of the databases to be scanned. In the provided function, the default value for 2613 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2614 databases that are in the "current" 2615 :type database_releases: list 2616 :return: The function `scan_databases` returns a dictionary containing information about 2617 databases that match the specified formats and releases. 
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (fall back to config, then to the tool default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for availabled databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Quick-annotation strings from `param` (e.g. `annotations`,
        `annotation_parquet`, `annotation_snpsift`, ...) are normalized into the
        structured `param["annotation"]` dict, then each configured annotation
        engine is dispatched in turn (parquet, bcftools, snpsift, annovar, snpeff,
        exomiser, splice).
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (searched when an annotation file is not
        # given as an absolute/existing path)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated quick-annotation string)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param — fold the per-tool shortcut parameters into the
        # quick-annotation list with a "tool:" prefix
        # NOTE(review): `!= None` should idiomatically be `is not None` throughout
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: map each annotation file to its
            # requested fields ({"INFO": None} means "all fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL ("ALL" or "ALL:format=...:release=...")
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    # NOTE(review): defaults are plain strings while the explicit
                    # options are split into lists — presumably databases_infos /
                    # scan_databases accepts both; confirm
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2796 ) 2797 2798 for annotation_file in annotations_list: 2799 2800 # Init 2801 annotations = annotations_list.get(annotation_file, None) 2802 2803 # Annotation snpEff 2804 if annotation_file.startswith("snpeff"): 2805 2806 log.debug(f"Quick Annotation snpEff") 2807 2808 if "snpeff" not in param["annotation"]: 2809 param["annotation"]["snpeff"] = {} 2810 2811 if "options" not in param["annotation"]["snpeff"]: 2812 param["annotation"]["snpeff"]["options"] = "" 2813 2814 # snpEff options in annotations 2815 param["annotation"]["snpeff"]["options"] = "".join( 2816 annotation_file.split(":")[1:] 2817 ) 2818 2819 # Annotation Annovar 2820 elif annotation_file.startswith("annovar"): 2821 2822 log.debug(f"Quick Annotation Annovar") 2823 2824 if "annovar" not in param["annotation"]: 2825 param["annotation"]["annovar"] = {} 2826 2827 if "annotations" not in param["annotation"]["annovar"]: 2828 param["annotation"]["annovar"]["annotations"] = {} 2829 2830 # Options 2831 annotation_file_split = annotation_file.split(":") 2832 for annotation_file_annotation in annotation_file_split[1:]: 2833 if annotation_file_annotation: 2834 param["annotation"]["annovar"]["annotations"][ 2835 annotation_file_annotation 2836 ] = annotations 2837 2838 # Annotation Exomiser 2839 elif annotation_file.startswith("exomiser"): 2840 2841 log.debug(f"Quick Annotation Exomiser") 2842 2843 param["annotation"]["exomiser"] = params_string_to_dict( 2844 annotation_file 2845 ) 2846 2847 # Annotation Splice 2848 elif annotation_file.startswith("splice"): 2849 2850 log.debug(f"Quick Annotation Splice") 2851 2852 param["annotation"]["splice"] = params_string_to_dict( 2853 annotation_file 2854 ) 2855 2856 # Annotation Parquet or BCFTOOLS 2857 else: 2858 2859 # Tools detection 2860 if annotation_file.startswith("bcftools:"): 2861 annotation_tool_initial = "bcftools" 2862 annotation_file = ":".join(annotation_file.split(":")[1:]) 2863 elif annotation_file.startswith("snpsift:"): 2864 annotation_tool_initial = 
"snpsift" 2865 annotation_file = ":".join(annotation_file.split(":")[1:]) 2866 else: 2867 annotation_tool_initial = None 2868 2869 # list of files 2870 annotation_file_list = annotation_file.replace("+", ":").split( 2871 ":" 2872 ) 2873 2874 for annotation_file in annotation_file_list: 2875 2876 if annotation_file: 2877 2878 # Annotation tool initial 2879 annotation_tool = annotation_tool_initial 2880 2881 # Find file 2882 annotation_file_found = None 2883 2884 # Expand user 2885 annotation_file = full_path(annotation_file) 2886 2887 if os.path.exists(annotation_file): 2888 annotation_file_found = annotation_file 2889 2890 else: 2891 # Find within assembly folders 2892 for annotations_database in annotations_databases: 2893 found_files = find_all( 2894 annotation_file, 2895 os.path.join( 2896 annotations_database, assembly 2897 ), 2898 ) 2899 if len(found_files) > 0: 2900 annotation_file_found = found_files[0] 2901 break 2902 if not annotation_file_found and not assembly: 2903 # Find within folders 2904 for ( 2905 annotations_database 2906 ) in annotations_databases: 2907 found_files = find_all( 2908 annotation_file, annotations_database 2909 ) 2910 if len(found_files) > 0: 2911 annotation_file_found = found_files[0] 2912 break 2913 log.debug( 2914 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2915 ) 2916 2917 # Full path 2918 annotation_file_found = full_path(annotation_file_found) 2919 2920 if annotation_file_found: 2921 2922 database = Database(database=annotation_file_found) 2923 quick_annotation_format = database.get_format() 2924 quick_annotation_is_compressed = ( 2925 database.is_compressed() 2926 ) 2927 quick_annotation_is_indexed = os.path.exists( 2928 f"{annotation_file_found}.tbi" 2929 ) 2930 bcftools_preference = False 2931 2932 # Check Annotation Tool 2933 if not annotation_tool: 2934 if ( 2935 bcftools_preference 2936 and quick_annotation_format 2937 in ["vcf", "bed"] 2938 and quick_annotation_is_compressed 2939 and 
quick_annotation_is_indexed 2940 ): 2941 annotation_tool = "bcftools" 2942 elif quick_annotation_format in [ 2943 "vcf", 2944 "bed", 2945 "tsv", 2946 "tsv", 2947 "csv", 2948 "json", 2949 "tbl", 2950 "parquet", 2951 "duckdb", 2952 ]: 2953 annotation_tool = "parquet" 2954 else: 2955 log.error( 2956 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 2957 ) 2958 raise ValueError( 2959 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 2960 ) 2961 2962 log.debug( 2963 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 2964 ) 2965 2966 # Annotation Tool dispatch 2967 if annotation_tool: 2968 if annotation_tool not in param["annotation"]: 2969 param["annotation"][annotation_tool] = {} 2970 if ( 2971 "annotations" 2972 not in param["annotation"][annotation_tool] 2973 ): 2974 param["annotation"][annotation_tool][ 2975 "annotations" 2976 ] = {} 2977 param["annotation"][annotation_tool][ 2978 "annotations" 2979 ][annotation_file_found] = annotations 2980 2981 else: 2982 log.error( 2983 f"Quick Annotation File {annotation_file} does NOT exist" 2984 ) 2985 2986 self.set_param(param) 2987 2988 if param.get("annotation", None): 2989 log.info("Annotations") 2990 if param.get("annotation", {}).get("parquet", None): 2991 log.info("Annotations 'parquet'...") 2992 self.annotation_parquet() 2993 if param.get("annotation", {}).get("bcftools", None): 2994 log.info("Annotations 'bcftools'...") 2995 self.annotation_bcftools() 2996 if param.get("annotation", {}).get("snpsift", None): 2997 log.info("Annotations 'snpsift'...") 2998 self.annotation_snpsift() 2999 if param.get("annotation", {}).get("annovar", None): 3000 log.info("Annotations 'annovar'...") 3001 self.annotation_annovar() 3002 if param.get("annotation", {}).get("snpeff", None): 3003 log.info("Annotations 'snpeff'...") 3004 self.annotation_snpeff() 3005 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3006 log.info("Annotations 'exomiser'...") 3007 self.annotation_exomiser() 3008 if param.get("annotation", {}).get("splice", None) is not None: 3009 log.info("Annotations 'splice' ...") 3010 self.annotation_splice() 3011 3012 # Explode INFOS fields into table fields 3013 if self.get_explode_infos(): 3014 self.explode_infos( 3015 prefix=self.get_explode_infos_prefix(), 3016 fields=self.get_explode_infos_fields(), 3017 force=True, 3018 ) 3019 3020 def annotation_snpsift(self, threads: int = None) -> None: 3021 """ 3022 This function annotate with bcftools 3023 3024 :param threads: Number of threads to use 3025 :return: the value of the variable "return_value". 3026 """ 3027 3028 # DEBUG 3029 log.debug("Start annotation with bcftools databases") 3030 3031 # Threads 3032 if not threads: 3033 threads = self.get_threads() 3034 log.debug("Threads: " + str(threads)) 3035 3036 # Config 3037 config = self.get_config() 3038 log.debug("Config: " + str(config)) 3039 3040 # Config - snpSift 3041 snpsift_bin_command = get_bin_command( 3042 bin="SnpSift.jar", 3043 tool="snpsift", 3044 bin_type="jar", 3045 config=config, 3046 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3047 ) 3048 if not snpsift_bin_command: 3049 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3050 log.error(msg_err) 3051 raise ValueError(msg_err) 3052 3053 # Config - bcftools 3054 bcftools_bin_command = get_bin_command( 3055 bin="bcftools", 3056 tool="bcftools", 3057 bin_type="bin", 3058 config=config, 3059 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3060 ) 3061 if not bcftools_bin_command: 3062 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3063 log.error(msg_err) 3064 raise ValueError(msg_err) 3065 3066 # Config - BCFTools databases folders 3067 databases_folders = set( 3068 self.get_config() 3069 .get("folders", {}) 3070 .get("databases", {}) 3071 .get("annotations", ["."]) 3072 + self.get_config() 3073 .get("folders", {}) 3074 
.get("databases", {}) 3075 .get("bcftools", ["."]) 3076 ) 3077 log.debug("Databases annotations: " + str(databases_folders)) 3078 3079 # Param 3080 annotations = ( 3081 self.get_param() 3082 .get("annotation", {}) 3083 .get("snpsift", {}) 3084 .get("annotations", None) 3085 ) 3086 log.debug("Annotations: " + str(annotations)) 3087 3088 # Assembly 3089 assembly = self.get_param().get( 3090 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3091 ) 3092 3093 # Data 3094 table_variants = self.get_table_variants() 3095 3096 # Check if not empty 3097 log.debug("Check if not empty") 3098 sql_query_chromosomes = ( 3099 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3100 ) 3101 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3102 if not sql_query_chromosomes_df["count"][0]: 3103 log.info(f"VCF empty") 3104 return 3105 3106 # VCF header 3107 vcf_reader = self.get_header() 3108 log.debug("Initial header: " + str(vcf_reader.infos)) 3109 3110 # Existing annotations 3111 for vcf_annotation in self.get_header().infos: 3112 3113 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3114 log.debug( 3115 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3116 ) 3117 3118 if annotations: 3119 3120 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3121 3122 # Export VCF file 3123 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3124 3125 # Init 3126 commands = {} 3127 3128 for annotation in annotations: 3129 annotation_fields = annotations[annotation] 3130 3131 # Annotation Name 3132 annotation_name = os.path.basename(annotation) 3133 3134 if not annotation_fields: 3135 annotation_fields = {"INFO": None} 3136 3137 log.debug(f"Annotation '{annotation_name}'") 3138 log.debug( 3139 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3140 ) 3141 3142 # Create Database 3143 database = Database( 3144 database=annotation, 3145 databases_folders=databases_folders, 3146 
assembly=assembly, 3147 ) 3148 3149 # Find files 3150 db_file = database.get_database() 3151 db_file = full_path(db_file) 3152 db_hdr_file = database.get_header_file() 3153 db_hdr_file = full_path(db_hdr_file) 3154 db_file_type = database.get_format() 3155 db_tbi_file = f"{db_file}.tbi" 3156 db_file_compressed = database.is_compressed() 3157 3158 # Check if compressed 3159 if not db_file_compressed: 3160 log.error( 3161 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3162 ) 3163 raise ValueError( 3164 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3165 ) 3166 3167 # Check if indexed 3168 if not os.path.exists(db_tbi_file): 3169 log.error( 3170 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3171 ) 3172 raise ValueError( 3173 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3174 ) 3175 3176 # Check index - try to create if not exists 3177 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3178 log.error("Annotation failed: database not valid") 3179 log.error(f"Annotation annotation file: {db_file}") 3180 log.error(f"Annotation annotation header: {db_hdr_file}") 3181 log.error(f"Annotation annotation index: {db_tbi_file}") 3182 raise ValueError( 3183 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3184 ) 3185 else: 3186 3187 log.debug( 3188 f"Annotation '{annotation}' - file: " 3189 + str(db_file) 3190 + " and " 3191 + str(db_hdr_file) 3192 ) 3193 3194 # Load header as VCF object 3195 db_hdr_vcf = Variants(input=db_hdr_file) 3196 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3197 log.debug( 3198 "Annotation database header: " 3199 + str(db_hdr_vcf_header_infos) 3200 ) 3201 3202 # For all fields in database 3203 annotation_fields_full = False 3204 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3205 annotation_fields = { 3206 key: key for key in 
db_hdr_vcf_header_infos 3207 } 3208 log.debug( 3209 "Annotation database header - All annotations added: " 3210 + str(annotation_fields) 3211 ) 3212 annotation_fields_full = True 3213 3214 # # Create file for field rename 3215 # log.debug("Create file for field rename") 3216 # tmp_rename = NamedTemporaryFile( 3217 # prefix=self.get_prefix(), 3218 # dir=self.get_tmp_dir(), 3219 # suffix=".rename", 3220 # delete=False, 3221 # ) 3222 # tmp_rename_name = tmp_rename.name 3223 # tmp_files.append(tmp_rename_name) 3224 3225 # Number of fields 3226 nb_annotation_field = 0 3227 annotation_list = [] 3228 annotation_infos_rename_list = [] 3229 3230 for annotation_field in annotation_fields: 3231 3232 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3233 annotation_fields_new_name = annotation_fields.get( 3234 annotation_field, annotation_field 3235 ) 3236 if not annotation_fields_new_name: 3237 annotation_fields_new_name = annotation_field 3238 3239 # Check if field is in DB and if field is not elready in input data 3240 if ( 3241 annotation_field in db_hdr_vcf.get_header().infos 3242 and annotation_fields_new_name 3243 not in self.get_header().infos 3244 ): 3245 3246 log.info( 3247 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3248 ) 3249 3250 # BCFTools annotate param to rename fields 3251 if annotation_field != annotation_fields_new_name: 3252 annotation_infos_rename_list.append( 3253 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3254 ) 3255 3256 # Add INFO field to header 3257 db_hdr_vcf_header_infos_number = ( 3258 db_hdr_vcf_header_infos[annotation_field].num or "." 
3259 ) 3260 db_hdr_vcf_header_infos_type = ( 3261 db_hdr_vcf_header_infos[annotation_field].type 3262 or "String" 3263 ) 3264 db_hdr_vcf_header_infos_description = ( 3265 db_hdr_vcf_header_infos[annotation_field].desc 3266 or f"{annotation_field} description" 3267 ) 3268 db_hdr_vcf_header_infos_source = ( 3269 db_hdr_vcf_header_infos[annotation_field].source 3270 or "unknown" 3271 ) 3272 db_hdr_vcf_header_infos_version = ( 3273 db_hdr_vcf_header_infos[annotation_field].version 3274 or "unknown" 3275 ) 3276 3277 vcf_reader.infos[annotation_fields_new_name] = ( 3278 vcf.parser._Info( 3279 annotation_fields_new_name, 3280 db_hdr_vcf_header_infos_number, 3281 db_hdr_vcf_header_infos_type, 3282 db_hdr_vcf_header_infos_description, 3283 db_hdr_vcf_header_infos_source, 3284 db_hdr_vcf_header_infos_version, 3285 self.code_type_map[ 3286 db_hdr_vcf_header_infos_type 3287 ], 3288 ) 3289 ) 3290 3291 annotation_list.append(annotation_field) 3292 3293 nb_annotation_field += 1 3294 3295 else: 3296 3297 if ( 3298 annotation_field 3299 not in db_hdr_vcf.get_header().infos 3300 ): 3301 log.warning( 3302 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3303 ) 3304 if ( 3305 annotation_fields_new_name 3306 in self.get_header().infos 3307 ): 3308 log.warning( 3309 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3310 ) 3311 3312 log.info( 3313 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3314 ) 3315 3316 annotation_infos = ",".join(annotation_list) 3317 3318 if annotation_infos != "": 3319 3320 # Annotated VCF (and error file) 3321 tmp_annotation_vcf_name = os.path.join( 3322 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3323 ) 3324 tmp_annotation_vcf_name_err = ( 3325 tmp_annotation_vcf_name + ".err" 3326 ) 3327 3328 # Add fields to annotate 3329 if not annotation_fields_full: 3330 annotation_infos_option = f"-info {annotation_infos}" 3331 else: 
3332 annotation_infos_option = "" 3333 3334 # Info fields rename 3335 if annotation_infos_rename_list: 3336 annotation_infos_rename = " -c " + ",".join( 3337 annotation_infos_rename_list 3338 ) 3339 else: 3340 annotation_infos_rename = "" 3341 3342 # Annotate command 3343 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3344 3345 # Add command 3346 commands[command_annotate] = tmp_annotation_vcf_name 3347 3348 if commands: 3349 3350 # Export VCF file 3351 self.export_variant_vcf( 3352 vcf_file=tmp_vcf_name, 3353 remove_info=True, 3354 add_samples=False, 3355 index=True, 3356 ) 3357 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3358 3359 # Num command 3360 nb_command = 0 3361 3362 # Annotate 3363 for command_annotate in commands: 3364 nb_command += 1 3365 log.info( 3366 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3367 ) 3368 log.debug(f"command_annotate={command_annotate}") 3369 run_parallel_commands([command_annotate], threads) 3370 3371 # Debug 3372 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3373 3374 # Update variants 3375 log.info( 3376 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3377 ) 3378 self.update_from_vcf(commands[command_annotate]) 3379 3380 def annotation_bcftools(self, threads: int = None) -> None: 3381 """ 3382 This function annotate with bcftools 3383 3384 :param threads: Number of threads to use 3385 :return: the value of the variable "return_value". 
3386 """ 3387 3388 # DEBUG 3389 log.debug("Start annotation with bcftools databases") 3390 3391 # Threads 3392 if not threads: 3393 threads = self.get_threads() 3394 log.debug("Threads: " + str(threads)) 3395 3396 # Config 3397 config = self.get_config() 3398 log.debug("Config: " + str(config)) 3399 3400 # DEBUG 3401 delete_tmp = True 3402 if self.get_config().get("verbosity", "warning") in ["debug"]: 3403 delete_tmp = False 3404 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3405 3406 # Config - BCFTools bin command 3407 bcftools_bin_command = get_bin_command( 3408 bin="bcftools", 3409 tool="bcftools", 3410 bin_type="bin", 3411 config=config, 3412 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3413 ) 3414 if not bcftools_bin_command: 3415 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3416 log.error(msg_err) 3417 raise ValueError(msg_err) 3418 3419 # Config - BCFTools databases folders 3420 databases_folders = set( 3421 self.get_config() 3422 .get("folders", {}) 3423 .get("databases", {}) 3424 .get("annotations", ["."]) 3425 + self.get_config() 3426 .get("folders", {}) 3427 .get("databases", {}) 3428 .get("bcftools", ["."]) 3429 ) 3430 log.debug("Databases annotations: " + str(databases_folders)) 3431 3432 # Param 3433 annotations = ( 3434 self.get_param() 3435 .get("annotation", {}) 3436 .get("bcftools", {}) 3437 .get("annotations", None) 3438 ) 3439 log.debug("Annotations: " + str(annotations)) 3440 3441 # Assembly 3442 assembly = self.get_param().get( 3443 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3444 ) 3445 3446 # Data 3447 table_variants = self.get_table_variants() 3448 3449 # Check if not empty 3450 log.debug("Check if not empty") 3451 sql_query_chromosomes = ( 3452 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3453 ) 3454 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3455 if not sql_query_chromosomes_df["count"][0]: 3456 log.info(f"VCF empty") 
3457 return 3458 3459 # Export in VCF 3460 log.debug("Create initial file to annotate") 3461 tmp_vcf = NamedTemporaryFile( 3462 prefix=self.get_prefix(), 3463 dir=self.get_tmp_dir(), 3464 suffix=".vcf.gz", 3465 delete=False, 3466 ) 3467 tmp_vcf_name = tmp_vcf.name 3468 3469 # VCF header 3470 vcf_reader = self.get_header() 3471 log.debug("Initial header: " + str(vcf_reader.infos)) 3472 3473 # Existing annotations 3474 for vcf_annotation in self.get_header().infos: 3475 3476 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3477 log.debug( 3478 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3479 ) 3480 3481 if annotations: 3482 3483 tmp_ann_vcf_list = [] 3484 commands = [] 3485 tmp_files = [] 3486 err_files = [] 3487 3488 for annotation in annotations: 3489 annotation_fields = annotations[annotation] 3490 3491 # Annotation Name 3492 annotation_name = os.path.basename(annotation) 3493 3494 if not annotation_fields: 3495 annotation_fields = {"INFO": None} 3496 3497 log.debug(f"Annotation '{annotation_name}'") 3498 log.debug( 3499 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3500 ) 3501 3502 # Create Database 3503 database = Database( 3504 database=annotation, 3505 databases_folders=databases_folders, 3506 assembly=assembly, 3507 ) 3508 3509 # Find files 3510 db_file = database.get_database() 3511 db_file = full_path(db_file) 3512 db_hdr_file = database.get_header_file() 3513 db_hdr_file = full_path(db_hdr_file) 3514 db_file_type = database.get_format() 3515 db_tbi_file = f"{db_file}.tbi" 3516 db_file_compressed = database.is_compressed() 3517 3518 # Check if compressed 3519 if not db_file_compressed: 3520 log.error( 3521 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3522 ) 3523 raise ValueError( 3524 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3525 ) 3526 3527 # Check if indexed 3528 if not os.path.exists(db_tbi_file): 3529 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3530 raise ValueError( 3531 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3532 ) 3533 3534 # Check index - try to create if not exists 3535 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3536 log.error("Annotation failed: database not valid") 3537 log.error(f"Annotation annotation file: {db_file}") 3538 log.error(f"Annotation annotation header: {db_hdr_file}") 3539 log.error(f"Annotation annotation index: {db_tbi_file}") 3540 raise ValueError( 3541 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3542 ) 3543 else: 3544 3545 log.debug( 3546 f"Annotation '{annotation}' - file: " 3547 + str(db_file) 3548 + " and " 3549 + str(db_hdr_file) 3550 ) 3551 3552 # Load header as VCF object 3553 db_hdr_vcf = Variants(input=db_hdr_file) 3554 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3555 log.debug( 3556 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3557 ) 3558 3559 # For all fields in database 3560 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3561 annotation_fields = { 3562 key: key for key in db_hdr_vcf_header_infos 3563 } 3564 log.debug( 3565 "Annotation database header - All annotations added: " 3566 + str(annotation_fields) 3567 ) 3568 3569 # Number of fields 3570 nb_annotation_field = 0 3571 annotation_list = [] 3572 3573 for annotation_field in annotation_fields: 3574 3575 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3576 annotation_fields_new_name = annotation_fields.get( 3577 annotation_field, annotation_field 3578 ) 3579 if not annotation_fields_new_name: 3580 annotation_fields_new_name = annotation_field 3581 3582 # Check if field is in DB and if field is not elready in input data 3583 if ( 3584 annotation_field in db_hdr_vcf.get_header().infos 3585 and annotation_fields_new_name 3586 not in self.get_header().infos 3587 ): 3588 3589 log.info( 3590 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3591 ) 3592 3593 # Add INFO field to header 3594 db_hdr_vcf_header_infos_number = ( 3595 db_hdr_vcf_header_infos[annotation_field].num or "." 3596 ) 3597 db_hdr_vcf_header_infos_type = ( 3598 db_hdr_vcf_header_infos[annotation_field].type 3599 or "String" 3600 ) 3601 db_hdr_vcf_header_infos_description = ( 3602 db_hdr_vcf_header_infos[annotation_field].desc 3603 or f"{annotation_field} description" 3604 ) 3605 db_hdr_vcf_header_infos_source = ( 3606 db_hdr_vcf_header_infos[annotation_field].source 3607 or "unknown" 3608 ) 3609 db_hdr_vcf_header_infos_version = ( 3610 db_hdr_vcf_header_infos[annotation_field].version 3611 or "unknown" 3612 ) 3613 3614 vcf_reader.infos[annotation_fields_new_name] = ( 3615 vcf.parser._Info( 3616 annotation_fields_new_name, 3617 db_hdr_vcf_header_infos_number, 3618 db_hdr_vcf_header_infos_type, 3619 db_hdr_vcf_header_infos_description, 3620 db_hdr_vcf_header_infos_source, 3621 db_hdr_vcf_header_infos_version, 3622 self.code_type_map[db_hdr_vcf_header_infos_type], 3623 ) 3624 ) 3625 3626 # annotation_list.append(annotation_field) 3627 if annotation_field != annotation_fields_new_name: 3628 annotation_list.append( 3629 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3630 ) 3631 else: 3632 annotation_list.append(annotation_field) 3633 3634 nb_annotation_field += 1 3635 3636 else: 3637 3638 if annotation_field not in db_hdr_vcf.get_header().infos: 3639 log.warning( 3640 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3641 ) 3642 if annotation_fields_new_name in self.get_header().infos: 3643 log.warning( 3644 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3645 ) 3646 3647 log.info( 3648 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3649 ) 3650 3651 annotation_infos = ",".join(annotation_list) 3652 3653 if annotation_infos != "": 3654 3655 # Protect header for bcftools (remove "#CHROM" and variants line) 3656 log.debug("Protect Header file - remove #CHROM line if exists") 3657 tmp_header_vcf = NamedTemporaryFile( 3658 prefix=self.get_prefix(), 3659 dir=self.get_tmp_dir(), 3660 suffix=".hdr", 3661 delete=False, 3662 ) 3663 tmp_header_vcf_name = tmp_header_vcf.name 3664 tmp_files.append(tmp_header_vcf_name) 3665 # Command 3666 if db_hdr_file.endswith(".gz"): 3667 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3668 else: 3669 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3670 # Run 3671 run_parallel_commands([command_extract_header], 1) 3672 3673 # Find chomosomes 3674 log.debug("Find chromosomes ") 3675 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3676 sql_query_chromosomes_df = self.get_query_to_df( 3677 sql_query_chromosomes 3678 ) 3679 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3680 3681 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3682 3683 # BED columns in the annotation file 3684 if db_file_type in ["bed"]: 3685 annotation_infos = "CHROM,POS,POS," + annotation_infos 3686 3687 for chrom in chomosomes_list: 3688 3689 # Create BED on initial VCF 3690 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3691 tmp_bed = NamedTemporaryFile( 3692 prefix=self.get_prefix(), 3693 
dir=self.get_tmp_dir(), 3694 suffix=".bed", 3695 delete=False, 3696 ) 3697 tmp_bed_name = tmp_bed.name 3698 tmp_files.append(tmp_bed_name) 3699 3700 # Detecte regions 3701 log.debug( 3702 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3703 ) 3704 window = 1000000 3705 sql_query_intervals_for_bed = f""" 3706 SELECT \"#CHROM\", 3707 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3708 \"POS\"+{window} 3709 FROM {table_variants} as table_variants 3710 WHERE table_variants.\"#CHROM\" = '{chrom}' 3711 """ 3712 regions = self.conn.execute( 3713 sql_query_intervals_for_bed 3714 ).fetchall() 3715 merged_regions = merge_regions(regions) 3716 log.debug( 3717 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3718 ) 3719 3720 header = ["#CHROM", "START", "END"] 3721 with open(tmp_bed_name, "w") as f: 3722 # Write the header with tab delimiter 3723 f.write("\t".join(header) + "\n") 3724 for d in merged_regions: 3725 # Write each data row with tab delimiter 3726 f.write("\t".join(map(str, d)) + "\n") 3727 3728 # Tmp files 3729 tmp_annotation_vcf = NamedTemporaryFile( 3730 prefix=self.get_prefix(), 3731 dir=self.get_tmp_dir(), 3732 suffix=".vcf.gz", 3733 delete=False, 3734 ) 3735 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3736 tmp_files.append(tmp_annotation_vcf_name) 3737 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3738 tmp_annotation_vcf_name_err = ( 3739 tmp_annotation_vcf_name + ".err" 3740 ) 3741 err_files.append(tmp_annotation_vcf_name_err) 3742 3743 # Annotate Command 3744 log.debug( 3745 f"Annotation '{annotation}' - add bcftools command" 3746 ) 3747 3748 # Command 3749 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3750 3751 # Add command 3752 commands.append(command_annotate) 3753 3754 # if some commands 3755 if commands: 3756 3757 # Export VCF file 3758 self.export_variant_vcf( 3759 vcf_file=tmp_vcf_name, 3760 remove_info=True, 3761 add_samples=False, 3762 index=True, 3763 ) 3764 3765 # Threads 3766 # calculate threads for annotated commands 3767 if commands: 3768 threads_bcftools_annotate = round(threads / len(commands)) 3769 else: 3770 threads_bcftools_annotate = 1 3771 3772 if not threads_bcftools_annotate: 3773 threads_bcftools_annotate = 1 3774 3775 # Add threads option to bcftools commands 3776 if threads_bcftools_annotate > 1: 3777 commands_threaded = [] 3778 for command in commands: 3779 commands_threaded.append( 3780 command.replace( 3781 f"{bcftools_bin_command} annotate ", 3782 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3783 ) 3784 ) 3785 commands = commands_threaded 3786 3787 # Command annotation multithreading 3788 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3789 log.info( 3790 f"Annotation - Annotation multithreaded in " 3791 + str(len(commands)) 3792 + " commands" 3793 ) 3794 3795 run_parallel_commands(commands, threads) 3796 3797 # Merge 3798 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3799 3800 if tmp_ann_vcf_list_cmd: 3801 3802 # Tmp file 3803 tmp_annotate_vcf = NamedTemporaryFile( 3804 prefix=self.get_prefix(), 3805 dir=self.get_tmp_dir(), 3806 suffix=".vcf.gz", 3807 delete=True, 3808 ) 3809 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3810 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3811 err_files.append(tmp_annotate_vcf_name_err) 3812 3813 # Tmp file remove command 3814 tmp_files_remove_command = "" 3815 if tmp_files: 3816 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3817 3818 # Command merge 3819 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (take a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True on success, False when the VCF is empty or has no samples.
            NOTE(review): the original signature said ``-> None`` but the body
            returns booleans; annotation corrected to ``bool``.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to the instance-wide thread count if not given
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # NOTE(review): a missing databases folder is only logged as an error here;
        # databases_download_exomiser() below is presumably expected to create it — confirm.
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser binary (jar) command; fail fast if not found
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, falls back to default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate in an empty VCF
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one sample
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit (default 8G)
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        # NOTE(review): hard-coded True, so the header check below is currently
        # always bypassed and annotation always runs — confirm intended.
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            # All intermediate files live in a temporary directory removed on exit
            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param (may be a dict or a JSON/YAML file path)
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis JSON/YAML
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict
                        # (yaml.safe_load parses both YAML and JSON)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Use analysis dict directly
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict:
                # use preset (exome/genome) to open a default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder
                        # (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load phenopacket JSON/YAML
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Use phenopacket dict directly
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If still no PhenoPacket in analysis dict
                # -> construct it from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not provided -> find a sample ID
                    if not param_exomiser_subject:

                        # Sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample in VCF)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not provided -> try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # HPO terms in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if provided as a comma-separated string
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep digits only, e.g. "HP:0001156" -> "0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If no phenotypicFeatures -> remove hiPhivePrioritiser step
                    # (it requires phenotype input)
                    # NOTE(review): removing from the list while iterating it —
                    # presumably safe because at most one step matches; confirm.
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): checks top-level "metaData" but writes it under
                # "phenopacket" — confirm the check should not be on the
                # phenopacket sub-dict instead.
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Force output directory/file name so results are found below
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output directory and ensure the formats this method
                    # relies on (TSV_VARIANT, VCF) are present
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict
                # NOTE(review): .copy() is shallow; the split below only pops a
                # top-level key, so nested dicts stay shared — acceptable here.
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include in initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample IDs within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample IDs in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and pedigree sample IDs (deduplicated)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used afterwards
                # (exomiser_command_analysis is built instead) — candidate for removal.
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if the file exists for this release/assembly)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains a proband -> use split analysis + sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually unique sample) -> single full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status -> failure)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0: header/dtypes only, no rows)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Coordinate/core columns that must not become INFO fields
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type inferred from pandas dtype
                            # NOTE(review): with LIMIT 0 the column is empty, so
                            # .all() on the empty coercion is vacuously True and
                            # every object column becomes "Float"; non-object
                            # (numeric) dtypes all map to "Integer", including
                            # floats — confirm this mapping is intended.
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize field name ("-" -> "_", drop "#")
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to the concat expression for the UPDATE below
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append TSV columns to INFO, joining on
                    # chr-prefixed CONTIG/START/ALT/REF
                    sql_query_update = f"""
                    UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                            SELECT
                                concat(
                                    {",".join(sql_query_update_concat_fields)}
                                )
                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                            WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                AND table_parquet.\"START\" = table_variants.\"POS\"
                                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                    """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in result VCF header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
(snpeff_jar and which(snpeff_jar))): 4693 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4694 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4695 4696 # Config - snpEff bin command 4697 snpeff_bin_command = get_bin_command( 4698 bin="snpEff.jar", 4699 tool="snpeff", 4700 bin_type="jar", 4701 config=config, 4702 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4703 ) 4704 if not snpeff_bin_command: 4705 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4706 log.error(msg_err) 4707 raise ValueError(msg_err) 4708 4709 # Config - snpEff databases 4710 snpeff_databases = ( 4711 config.get("folders", {}) 4712 .get("databases", {}) 4713 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4714 ) 4715 snpeff_databases = full_path(snpeff_databases) 4716 if snpeff_databases is not None and snpeff_databases != "": 4717 log.debug(f"Create snpEff databases folder") 4718 if not os.path.exists(snpeff_databases): 4719 os.makedirs(snpeff_databases) 4720 4721 # Param 4722 param = self.get_param() 4723 log.debug("Param: " + str(param)) 4724 4725 # Param 4726 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4727 log.debug("Options: " + str(options)) 4728 4729 # Param - Assembly 4730 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4731 4732 # Param - Options 4733 snpeff_options = ( 4734 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4735 ) 4736 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4737 snpeff_csvstats = ( 4738 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4739 ) 4740 if snpeff_stats: 4741 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4742 snpeff_stats = full_path(snpeff_stats) 4743 snpeff_options += f" -stats {snpeff_stats}" 4744 if snpeff_csvstats: 4745 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4746 snpeff_csvstats = full_path(snpeff_csvstats) 4747 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4748 4749 # Data 4750 table_variants = self.get_table_variants() 4751 4752 # Check if not empty 4753 log.debug("Check if not empty") 4754 sql_query_chromosomes = ( 4755 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4756 ) 4757 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4758 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4759 log.info(f"VCF empty") 4760 return 4761 4762 # Export in VCF 4763 log.debug("Create initial file to annotate") 4764 tmp_vcf = NamedTemporaryFile( 4765 prefix=self.get_prefix(), 4766 dir=self.get_tmp_dir(), 4767 suffix=".vcf.gz", 4768 delete=True, 4769 ) 4770 tmp_vcf_name = tmp_vcf.name 4771 4772 # VCF header 4773 vcf_reader = self.get_header() 4774 log.debug("Initial header: " + str(vcf_reader.infos)) 4775 4776 # Existing annotations 4777 for vcf_annotation in self.get_header().infos: 4778 4779 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4780 log.debug( 4781 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4782 ) 4783 4784 # Memory limit 4785 # if config.get("memory", None): 4786 # memory_limit = config.get("memory", "8G") 4787 # else: 4788 # memory_limit = "8G" 4789 memory_limit = self.get_memory("8G") 4790 log.debug(f"memory_limit: {memory_limit}") 4791 4792 # snpEff java options 4793 snpeff_java_options = ( 4794 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4795 ) 4796 log.debug(f"Exomiser java options: {snpeff_java_options}") 4797 4798 force_update_annotation = True 4799 4800 if "ANN" not in self.get_header().infos or force_update_annotation: 4801 4802 # Check snpEff database 4803 log.debug(f"Check snpEff databases {[assembly]}") 4804 databases_download_snpeff( 4805 folder=snpeff_databases, assemblies=[assembly], config=config 4806 ) 4807 4808 # Export VCF file 4809 self.export_variant_vcf( 4810 vcf_file=tmp_vcf_name, 4811 remove_info=True, 
4812 add_samples=False, 4813 index=True, 4814 ) 4815 4816 # Tmp file 4817 err_files = [] 4818 tmp_annotate_vcf = NamedTemporaryFile( 4819 prefix=self.get_prefix(), 4820 dir=self.get_tmp_dir(), 4821 suffix=".vcf", 4822 delete=False, 4823 ) 4824 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4825 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4826 err_files.append(tmp_annotate_vcf_name_err) 4827 4828 # Command 4829 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4830 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4831 run_parallel_commands([snpeff_command], 1) 4832 4833 # Error messages 4834 log.info(f"Error/Warning messages:") 4835 error_message_command_all = [] 4836 error_message_command_warning = [] 4837 error_message_command_err = [] 4838 for err_file in err_files: 4839 with open(err_file, "r") as f: 4840 for line in f: 4841 message = line.strip() 4842 error_message_command_all.append(message) 4843 if line.startswith("[W::"): 4844 error_message_command_warning.append(message) 4845 if line.startswith("[E::"): 4846 error_message_command_err.append(f"{err_file}: " + message) 4847 # log info 4848 for message in list( 4849 set(error_message_command_err + error_message_command_warning) 4850 ): 4851 log.info(f" {message}") 4852 # debug info 4853 for message in list(set(error_message_command_all)): 4854 log.debug(f" {message}") 4855 # failed 4856 if len(error_message_command_err): 4857 log.error("Annotation failed: Error in commands") 4858 raise ValueError("Annotation failed: Error in commands") 4859 4860 # Find annotation in header 4861 with open(tmp_annotate_vcf_name, "rt") as f: 4862 header_list = self.read_vcf_header(f) 4863 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4864 4865 for ann in annovar_vcf_header.infos: 4866 if ann not in self.get_header().infos: 4867 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann) 4868 4869 # Update variants 4870 log.info(f"Annotation - Updating...") 4871 self.update_from_vcf(tmp_annotate_vcf_name) 4872 4873 else: 4874 if "ANN" in self.get_header().infos: 4875 log.debug(f"Existing snpEff annotations in VCF") 4876 if force_update_annotation: 4877 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 4878 4879 def annotation_annovar(self, threads: int = None) -> None: 4880 """ 4881 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4882 annotations 4883 4884 :param threads: number of threads to use 4885 :return: the value of the variable "return_value". 4886 """ 4887 4888 # DEBUG 4889 log.debug("Start annotation with Annovar databases") 4890 4891 # Threads 4892 if not threads: 4893 threads = self.get_threads() 4894 log.debug("Threads: " + str(threads)) 4895 4896 # Tmp en Err files 4897 tmp_files = [] 4898 err_files = [] 4899 4900 # DEBUG 4901 delete_tmp = True 4902 if self.get_config().get("verbosity", "warning") in ["debug"]: 4903 delete_tmp = False 4904 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4905 4906 # Config 4907 config = self.get_config() 4908 log.debug("Config: " + str(config)) 4909 4910 # Config - Folders - Databases 4911 databases_folders = ( 4912 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4913 ) 4914 log.debug("Databases annotations: " + str(databases_folders)) 4915 4916 # Config - annovar bin command 4917 annovar_bin_command = get_bin_command( 4918 bin="table_annovar.pl", 4919 tool="annovar", 4920 bin_type="perl", 4921 config=config, 4922 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 4923 ) 4924 if not annovar_bin_command: 4925 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 4926 log.error(msg_err) 4927 raise ValueError(msg_err) 4928 4929 # Config - BCFTools bin command 4930 bcftools_bin_command = get_bin_command( 4931 bin="bcftools", 4932 tool="bcftools", 4933 
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        # Mapping of database name -> {field: renamed_field or None}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded True — already-present fields are always re-annotated
        # and the "skipped" warning branch below is unreachable.
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never appended to or executed in this method.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # err_files is reset per database so errors are checked per iteration
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    # One "old new" mapping line per field, consumed later by
                    # bcftools annotate --rename-annots
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation
                # ANNOVAR operation code: f=filter, g=gene-based, r=region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        # "^INFO/x" means keep x, drop other INFO fields (bcftools -x)
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                # NOTE(review): the merge .err file is appended to err_files but errors
                # are only parsed inside the per-database loop above — merge errors are
                # not checked; confirm whether that is intended.
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                    annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO definitions into the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): `if True:` means cleanup always runs regardless of delete_tmp.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with parquet (or attached database) files.

        :param threads: Number of threads to use for the annotation;
            defaults to ``self.get_threads()``
        :return: None (returns early without annotating when the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but not read afterwards
        # in the visible portion of this method.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Union of the "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update",
False) 5316 ) 5317 log.debug(f"force_update_annotation={force_update_annotation}") 5318 force_append_annotation = ( 5319 self.get_param() 5320 .get("annotation", {}) 5321 .get("options", {}) 5322 .get("annotations_append", False) 5323 ) 5324 log.debug(f"force_append_annotation={force_append_annotation}") 5325 5326 # Data 5327 table_variants = self.get_table_variants() 5328 5329 # Check if not empty 5330 log.debug("Check if not empty") 5331 sql_query_chromosomes_df = self.get_query_to_df( 5332 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5333 ) 5334 if not sql_query_chromosomes_df["count"][0]: 5335 log.info(f"VCF empty") 5336 return 5337 5338 # VCF header 5339 vcf_reader = self.get_header() 5340 log.debug("Initial header: " + str(vcf_reader.infos)) 5341 5342 # Nb Variants POS 5343 log.debug("NB Variants Start") 5344 nb_variants = self.conn.execute( 5345 f"SELECT count(*) AS count FROM variants" 5346 ).fetchdf()["count"][0] 5347 log.debug("NB Variants Stop") 5348 5349 # Existing annotations 5350 for vcf_annotation in self.get_header().infos: 5351 5352 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5353 log.debug( 5354 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5355 ) 5356 5357 # Added columns 5358 added_columns = [] 5359 5360 # drop indexes 5361 log.debug(f"Drop indexes...") 5362 self.drop_indexes() 5363 5364 if annotations: 5365 5366 if "ALL" in annotations: 5367 5368 all_param = annotations.get("ALL", {}) 5369 all_param_formats = all_param.get("formats", None) 5370 all_param_releases = all_param.get("releases", None) 5371 5372 databases_infos_dict = self.scan_databases( 5373 database_formats=all_param_formats, 5374 database_releases=all_param_releases, 5375 ) 5376 for database_infos in databases_infos_dict.keys(): 5377 if database_infos not in annotations: 5378 annotations[database_infos] = {"INFO": None} 5379 5380 for annotation in annotations: 5381 5382 if annotation in ["ALL"]: 
5383 continue 5384 5385 # Annotation Name 5386 annotation_name = os.path.basename(annotation) 5387 5388 # Annotation fields 5389 annotation_fields = annotations[annotation] 5390 if not annotation_fields: 5391 annotation_fields = {"INFO": None} 5392 5393 log.debug(f"Annotation '{annotation_name}'") 5394 log.debug( 5395 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5396 ) 5397 5398 # Create Database 5399 database = Database( 5400 database=annotation, 5401 databases_folders=databases_folders, 5402 assembly=assembly, 5403 ) 5404 5405 # Find files 5406 parquet_file = database.get_database() 5407 parquet_hdr_file = database.get_header_file() 5408 parquet_type = database.get_type() 5409 5410 # Check if files exists 5411 if not parquet_file or not parquet_hdr_file: 5412 log.error("Annotation failed: file not found") 5413 raise ValueError("Annotation failed: file not found") 5414 else: 5415 # Get parquet connexion 5416 parquet_sql_attach = database.get_sql_database_attach( 5417 output="query" 5418 ) 5419 if parquet_sql_attach: 5420 self.conn.execute(parquet_sql_attach) 5421 parquet_file_link = database.get_sql_database_link() 5422 # Log 5423 log.debug( 5424 f"Annotation '{annotation_name}' - file: " 5425 + str(parquet_file) 5426 + " and " 5427 + str(parquet_hdr_file) 5428 ) 5429 5430 # Database full header columns 5431 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5432 parquet_hdr_file 5433 ) 5434 # Log 5435 log.debug( 5436 "Annotation database header columns : " 5437 + str(parquet_hdr_vcf_header_columns) 5438 ) 5439 5440 # Load header as VCF object 5441 parquet_hdr_vcf_header_infos = database.get_header().infos 5442 # Log 5443 log.debug( 5444 "Annotation database header: " 5445 + str(parquet_hdr_vcf_header_infos) 5446 ) 5447 5448 # Get extra infos 5449 parquet_columns = database.get_extra_columns() 5450 # Log 5451 log.debug("Annotation database Columns: " + str(parquet_columns)) 5452 5453 # Add extra columns if "ALL" in 
annotation_fields 5454 # if "ALL" in annotation_fields: 5455 # allow_add_extra_column = True 5456 if "ALL" in annotation_fields and database.get_extra_columns(): 5457 for extra_column in database.get_extra_columns(): 5458 if ( 5459 extra_column not in annotation_fields 5460 and extra_column.replace("INFO/", "") 5461 not in parquet_hdr_vcf_header_infos 5462 ): 5463 parquet_hdr_vcf_header_infos[extra_column] = ( 5464 vcf.parser._Info( 5465 extra_column, 5466 ".", 5467 "String", 5468 f"{extra_column} description", 5469 "unknown", 5470 "unknown", 5471 self.code_type_map["String"], 5472 ) 5473 ) 5474 5475 # For all fields in database 5476 annotation_fields_all = False 5477 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5478 annotation_fields_all = True 5479 annotation_fields = { 5480 key: key for key in parquet_hdr_vcf_header_infos 5481 } 5482 5483 log.debug( 5484 "Annotation database header - All annotations added: " 5485 + str(annotation_fields) 5486 ) 5487 5488 # Init 5489 5490 # List of annotation fields to use 5491 sql_query_annotation_update_info_sets = [] 5492 5493 # List of annotation to agregate 5494 sql_query_annotation_to_agregate = [] 5495 5496 # Number of fields 5497 nb_annotation_field = 0 5498 5499 # Annotation fields processed 5500 annotation_fields_processed = [] 5501 5502 # Columns mapping 5503 map_columns = database.map_columns( 5504 columns=annotation_fields, prefixes=["INFO/"] 5505 ) 5506 5507 # Query dict for fields to remove (update option) 5508 query_dict_remove = {} 5509 5510 # Fetch Anotation fields 5511 for annotation_field in annotation_fields: 5512 5513 # annotation_field_column 5514 annotation_field_column = map_columns.get( 5515 annotation_field, "INFO" 5516 ) 5517 5518 # field new name, if parametered 5519 annotation_fields_new_name = annotation_fields.get( 5520 annotation_field, annotation_field 5521 ) 5522 if not annotation_fields_new_name: 5523 annotation_fields_new_name = annotation_field 5524 5525 # To annotate 5526 # 
force_update_annotation = True 5527 # force_append_annotation = True 5528 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5529 if annotation_field in parquet_hdr_vcf_header_infos and ( 5530 force_update_annotation 5531 or force_append_annotation 5532 or ( 5533 annotation_fields_new_name 5534 not in self.get_header().infos 5535 ) 5536 ): 5537 5538 # Add field to annotation to process list 5539 annotation_fields_processed.append( 5540 annotation_fields_new_name 5541 ) 5542 5543 # explode infos for the field 5544 annotation_fields_new_name_info_msg = "" 5545 if ( 5546 force_update_annotation 5547 and annotation_fields_new_name 5548 in self.get_header().infos 5549 ): 5550 # Remove field from INFO 5551 query = f""" 5552 UPDATE {table_variants} as table_variants 5553 SET INFO = REGEXP_REPLACE( 5554 concat(table_variants.INFO,''), 5555 ';*{annotation_fields_new_name}=[^;]*', 5556 '' 5557 ) 5558 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5559 """ 5560 annotation_fields_new_name_info_msg = " [update]" 5561 query_dict_remove[ 5562 f"remove 'INFO/{annotation_fields_new_name}'" 5563 ] = query 5564 5565 # Sep between fields in INFO 5566 nb_annotation_field += 1 5567 if nb_annotation_field > 1: 5568 annotation_field_sep = ";" 5569 else: 5570 annotation_field_sep = "" 5571 5572 log.info( 5573 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5574 ) 5575 5576 # Add INFO field to header 5577 parquet_hdr_vcf_header_infos_number = ( 5578 parquet_hdr_vcf_header_infos[annotation_field].num 5579 or "." 
5580 ) 5581 parquet_hdr_vcf_header_infos_type = ( 5582 parquet_hdr_vcf_header_infos[annotation_field].type 5583 or "String" 5584 ) 5585 parquet_hdr_vcf_header_infos_description = ( 5586 parquet_hdr_vcf_header_infos[annotation_field].desc 5587 or f"{annotation_field} description" 5588 ) 5589 parquet_hdr_vcf_header_infos_source = ( 5590 parquet_hdr_vcf_header_infos[annotation_field].source 5591 or "unknown" 5592 ) 5593 parquet_hdr_vcf_header_infos_version = ( 5594 parquet_hdr_vcf_header_infos[annotation_field].version 5595 or "unknown" 5596 ) 5597 5598 vcf_reader.infos[annotation_fields_new_name] = ( 5599 vcf.parser._Info( 5600 annotation_fields_new_name, 5601 parquet_hdr_vcf_header_infos_number, 5602 parquet_hdr_vcf_header_infos_type, 5603 parquet_hdr_vcf_header_infos_description, 5604 parquet_hdr_vcf_header_infos_source, 5605 parquet_hdr_vcf_header_infos_version, 5606 self.code_type_map[ 5607 parquet_hdr_vcf_header_infos_type 5608 ], 5609 ) 5610 ) 5611 5612 # Append 5613 if force_append_annotation: 5614 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5615 else: 5616 query_case_when_append = "" 5617 5618 # Annotation/Update query fields 5619 # Found in INFO column 5620 if ( 5621 annotation_field_column == "INFO" 5622 and "INFO" in parquet_hdr_vcf_header_columns 5623 ): 5624 sql_query_annotation_update_info_sets.append( 5625 f""" 5626 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5627 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5628 ELSE '' 5629 END 5630 """ 5631 ) 5632 # Found in a specific column 5633 else: 5634 sql_query_annotation_update_info_sets.append( 5635 f""" 5636 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5637 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5638 ELSE '' 5639 END 5640 """ 5641 ) 5642 sql_query_annotation_to_agregate.append( 5643 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5644 ) 5645 5646 # Not to annotate 5647 else: 5648 5649 if force_update_annotation: 5650 annotation_message = "forced" 5651 else: 5652 annotation_message = "skipped" 5653 5654 if annotation_field not in parquet_hdr_vcf_header_infos: 5655 log.warning( 5656 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5657 ) 5658 if annotation_fields_new_name in self.get_header().infos: 5659 log.warning( 5660 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5661 ) 5662 5663 # Check if ALL fields have to be annotated. Thus concat all INFO field 5664 # allow_annotation_full_info = True 5665 allow_annotation_full_info = not force_append_annotation 5666 5667 if parquet_type in ["regions"]: 5668 allow_annotation_full_info = False 5669 5670 if ( 5671 allow_annotation_full_info 5672 and nb_annotation_field == len(annotation_fields) 5673 and annotation_fields_all 5674 and ( 5675 "INFO" in parquet_hdr_vcf_header_columns 5676 and "INFO" in database.get_extra_columns() 5677 ) 5678 ): 5679 log.debug("Column INFO annotation enabled") 5680 sql_query_annotation_update_info_sets = [] 5681 sql_query_annotation_update_info_sets.append( 5682 f" table_parquet.INFO " 5683 ) 5684 5685 if sql_query_annotation_update_info_sets: 5686 5687 # Annotate 5688 log.info(f"Annotation '{annotation_name}' - Annotation...") 5689 5690 # Join query annotation update info sets for SQL 5691 sql_query_annotation_update_info_sets_sql = ",".join( 5692 sql_query_annotation_update_info_sets 5693 ) 5694 5695 # Check chromosomes list (and variants 
infos) 5696 sql_query_chromosomes = f""" 5697 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5698 FROM {table_variants} as table_variants 5699 GROUP BY table_variants."#CHROM" 5700 ORDER BY table_variants."#CHROM" 5701 """ 5702 sql_query_chromosomes_df = self.conn.execute( 5703 sql_query_chromosomes 5704 ).df() 5705 sql_query_chromosomes_dict = { 5706 entry["CHROM"]: { 5707 "count": entry["count_variants"], 5708 "min": entry["min_variants"], 5709 "max": entry["max_variants"], 5710 } 5711 for index, entry in sql_query_chromosomes_df.iterrows() 5712 } 5713 5714 # Init 5715 nb_of_query = 0 5716 nb_of_variant_annotated = 0 5717 query_dict = query_dict_remove 5718 5719 # for chrom in sql_query_chromosomes_df["CHROM"]: 5720 for chrom in sql_query_chromosomes_dict: 5721 5722 # Number of variant by chromosome 5723 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5724 chrom, {} 5725 ).get("count", 0) 5726 5727 log.debug( 5728 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5729 ) 5730 5731 # Annotation with regions database 5732 if parquet_type in ["regions"]: 5733 sql_query_annotation_from_clause = f""" 5734 FROM ( 5735 SELECT 5736 '{chrom}' AS \"#CHROM\", 5737 table_variants_from.\"POS\" AS \"POS\", 5738 {",".join(sql_query_annotation_to_agregate)} 5739 FROM {table_variants} as table_variants_from 5740 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5741 table_parquet_from."#CHROM" = '{chrom}' 5742 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5743 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5744 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5745 ) 5746 ) 5747 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5748 GROUP BY table_variants_from.\"POS\" 5749 ) 5750 as table_parquet 5751 """ 5752 5753 sql_query_annotation_where_clause = """ 5754 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5755 AND table_parquet.\"POS\" = table_variants.\"POS\" 5756 """ 5757 5758 # Annotation with variants database 5759 else: 5760 sql_query_annotation_from_clause = f""" 5761 FROM {parquet_file_link} as table_parquet 5762 """ 5763 sql_query_annotation_where_clause = f""" 5764 table_variants."#CHROM" = '{chrom}' 5765 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5766 AND table_parquet.\"POS\" = table_variants.\"POS\" 5767 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5768 AND table_parquet.\"REF\" = table_variants.\"REF\" 5769 """ 5770 5771 # Create update query 5772 sql_query_annotation_chrom_interval_pos = f""" 5773 UPDATE {table_variants} as table_variants 5774 SET INFO = 5775 concat( 5776 CASE WHEN table_variants.INFO NOT IN ('','.') 5777 THEN table_variants.INFO 5778 ELSE '' 5779 END 5780 , 5781 CASE WHEN table_variants.INFO NOT IN ('','.') 5782 AND ( 5783 concat({sql_query_annotation_update_info_sets_sql}) 5784 ) 5785 NOT IN ('','.') 5786 THEN ';' 5787 ELSE '' 5788 END 5789 , 5790 
{sql_query_annotation_update_info_sets_sql} 5791 ) 5792 {sql_query_annotation_from_clause} 5793 WHERE {sql_query_annotation_where_clause} 5794 ; 5795 """ 5796 5797 # Add update query to dict 5798 query_dict[ 5799 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5800 ] = sql_query_annotation_chrom_interval_pos 5801 5802 nb_of_query = len(query_dict) 5803 num_query = 0 5804 5805 # SET max_expression_depth TO x 5806 self.conn.execute("SET max_expression_depth TO 10000") 5807 5808 for query_name in query_dict: 5809 query = query_dict[query_name] 5810 num_query += 1 5811 log.info( 5812 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5813 ) 5814 result = self.conn.execute(query) 5815 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5816 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5817 log.info( 5818 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5819 ) 5820 5821 log.info( 5822 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5823 ) 5824 5825 else: 5826 5827 log.info( 5828 f"Annotation '{annotation_name}' - No Annotations available" 5829 ) 5830 5831 log.debug("Final header: " + str(vcf_reader.infos)) 5832 5833 # Remove added columns 5834 for added_column in added_columns: 5835 self.drop_column(column=added_column) 5836 5837 def annotation_splice(self, threads: int = None) -> None: 5838 """ 5839 This function annotate with snpEff 5840 5841 :param threads: The number of threads to use 5842 :return: the value of the variable "return_value". 
5843 """ 5844 5845 # DEBUG 5846 log.debug("Start annotation with splice tools") 5847 5848 # Threads 5849 if not threads: 5850 threads = self.get_threads() 5851 log.debug("Threads: " + str(threads)) 5852 5853 # DEBUG 5854 delete_tmp = True 5855 if self.get_config().get("verbosity", "warning") in ["debug"]: 5856 delete_tmp = False 5857 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5858 5859 # Config 5860 config = self.get_config() 5861 log.debug("Config: " + str(config)) 5862 splice_config = config.get("tools", {}).get("splice", {}) 5863 if not splice_config: 5864 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5865 if not splice_config: 5866 msg_err = "No Splice tool config" 5867 log.error(msg_err) 5868 raise ValueError(msg_err) 5869 log.debug(f"splice_config={splice_config}") 5870 5871 # Config - Folders - Databases 5872 databases_folders = ( 5873 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5874 ) 5875 log.debug("Databases annotations: " + str(databases_folders)) 5876 5877 # Splice docker image 5878 splice_docker_image = splice_config.get("docker").get("image") 5879 5880 # Pull splice image if it's not already there 5881 if not check_docker_image_exists(splice_docker_image): 5882 log.warning( 5883 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5884 ) 5885 try: 5886 command(f"docker pull {splice_config.get('docker').get('image')}") 5887 except subprocess.CalledProcessError: 5888 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5889 log.error(msg_err) 5890 raise ValueError(msg_err) 5891 return None 5892 5893 # Config - splice databases 5894 splice_databases = ( 5895 config.get("folders", {}) 5896 .get("databases", {}) 5897 .get("splice", DEFAULT_SPLICE_FOLDER) 5898 ) 5899 splice_databases = full_path(splice_databases) 5900 5901 # Param 5902 param = self.get_param() 5903 log.debug("Param: " + str(param)) 5904 5905 # Param 5906 options = 
param.get("annotation", {}).get("splice", {}) 5907 log.debug("Options: " + str(options)) 5908 5909 # Data 5910 table_variants = self.get_table_variants() 5911 5912 # Check if not empty 5913 log.debug("Check if not empty") 5914 sql_query_chromosomes = ( 5915 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5916 ) 5917 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5918 log.info("VCF empty") 5919 return None 5920 5921 # Export in VCF 5922 log.debug("Create initial file to annotate") 5923 5924 # Create output folder 5925 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5926 if not os.path.exists(output_folder): 5927 Path(output_folder).mkdir(parents=True, exist_ok=True) 5928 5929 # Create tmp VCF file 5930 tmp_vcf = NamedTemporaryFile( 5931 prefix=self.get_prefix(), 5932 dir=output_folder, 5933 suffix=".vcf", 5934 delete=False, 5935 ) 5936 tmp_vcf_name = tmp_vcf.name 5937 5938 # VCF header 5939 header = self.get_header() 5940 5941 # Existing annotations 5942 for vcf_annotation in self.get_header().infos: 5943 5944 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5945 log.debug( 5946 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5947 ) 5948 5949 # Memory limit 5950 if config.get("memory", None): 5951 memory_limit = config.get("memory", "8G").upper() 5952 # upper() 5953 else: 5954 memory_limit = "8G" 5955 log.debug(f"memory_limit: {memory_limit}") 5956 5957 # Export VCF file 5958 self.export_variant_vcf( 5959 vcf_file=tmp_vcf_name, 5960 remove_info=True, 5961 add_samples=True, 5962 index=False, 5963 ) 5964 5965 # Create docker container and launch splice analysis 5966 if splice_config: 5967 5968 # Splice mount folders 5969 mount_folders = splice_config.get("mount", {}) 5970 5971 # Genome mount 5972 mount_folders[ 5973 config.get("folders", {}) 5974 .get("databases", {}) 5975 .get("genomes", DEFAULT_GENOME_FOLDER) 5976 ] = "ro" 5977 5978 # SpliceAI mount 
5979 mount_folders[ 5980 config.get("folders", {}) 5981 .get("databases", {}) 5982 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 5983 ] = "ro" 5984 5985 # Genome mount 5986 mount_folders[ 5987 config.get("folders", {}) 5988 .get("databases", {}) 5989 .get("spip", DEFAULT_SPIP_FOLDER) 5990 ] = "ro" 5991 5992 # Mount folders 5993 mount = [] 5994 5995 # Config mount 5996 mount = [ 5997 f"-v {full_path(path)}:{full_path(path)}:{mode}" 5998 for path, mode in mount_folders.items() 5999 ] 6000 6001 if any(value for value in splice_config.values() if value is None): 6002 log.warning("At least one splice config parameter is empty") 6003 return None 6004 6005 # Params in splice nf 6006 def check_values(dico: dict): 6007 """ 6008 Ensure parameters for NF splice pipeline 6009 """ 6010 for key, val in dico.items(): 6011 if key == "genome": 6012 if any( 6013 assemb in options.get("genome", {}) 6014 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6015 ): 6016 yield f"--{key} hg19" 6017 elif any( 6018 assemb in options.get("genome", {}) 6019 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6020 ): 6021 yield f"--{key} hg38" 6022 elif ( 6023 (isinstance(val, str) and val) 6024 or isinstance(val, int) 6025 or isinstance(val, bool) 6026 ): 6027 yield f"--{key} {val}" 6028 6029 # Genome 6030 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6031 options["genome"] = genome 6032 6033 # NF params 6034 nf_params = [] 6035 6036 # Add options 6037 if options: 6038 nf_params = list(check_values(options)) 6039 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6040 else: 6041 log.debug("No NF params provided") 6042 6043 # Add threads 6044 if "threads" not in options.keys(): 6045 nf_params.append(f"--threads {threads}") 6046 6047 # Genome path 6048 genome_path = find_genome( 6049 config.get("folders", {}) 6050 .get("databases", {}) 6051 .get("genomes", DEFAULT_GENOME_FOLDER), 6052 file=f"{genome}.fa", 6053 ) 6054 # Add genome path 6055 if not genome_path: 6056 
raise ValueError( 6057 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6058 ) 6059 else: 6060 log.debug(f"Genome: {genome_path}") 6061 nf_params.append(f"--genome_path {genome_path}") 6062 6063 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6064 """ 6065 Setting up updated databases for SPiP and SpliceAI 6066 """ 6067 6068 try: 6069 6070 # SpliceAI assembly transcriptome 6071 spliceai_assembly = os.path.join( 6072 config.get("folders", {}) 6073 .get("databases", {}) 6074 .get("spliceai", {}), 6075 options.get("genome"), 6076 "transcriptome", 6077 ) 6078 spip_assembly = options.get("genome") 6079 6080 spip = find( 6081 f"transcriptome_{spip_assembly}.RData", 6082 config.get("folders", {}).get("databases", {}).get("spip", {}), 6083 ) 6084 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6085 log.debug(f"SPiP annotations: {spip}") 6086 log.debug(f"SpliceAI annotations: {spliceai}") 6087 if spip and spliceai: 6088 return [ 6089 f"--spip_transcriptome {spip}", 6090 f"--spliceai_annotations {spliceai}", 6091 ] 6092 else: 6093 # TODO crash and go on with basic annotations ? 
6094 # raise ValueError( 6095 # "Can't find splice databases in configuration EXIT" 6096 # ) 6097 log.warning( 6098 "Can't find splice databases in configuration, use annotations file from image" 6099 ) 6100 except TypeError: 6101 log.warning( 6102 "Can't find splice databases in configuration, use annotations file from image" 6103 ) 6104 return [] 6105 6106 # Add options, check if transcriptome option have already beend provided 6107 if ( 6108 "spip_transcriptome" not in nf_params 6109 and "spliceai_transcriptome" not in nf_params 6110 ): 6111 splice_reference = splice_annotations(options, config) 6112 if splice_reference: 6113 nf_params.extend(splice_reference) 6114 6115 nf_params.append(f"--output_folder {output_folder}") 6116 6117 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6118 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6119 log.debug(cmd) 6120 6121 splice_config["docker"]["command"] = cmd 6122 6123 docker_cmd = get_bin_command( 6124 tool="splice", 6125 bin_type="docker", 6126 config=config, 6127 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6128 add_options=f"--name {random_uuid} {' '.join(mount)}", 6129 ) 6130 6131 # Docker debug 6132 # if splice_config.get("rm_container"): 6133 # rm_container = "--rm" 6134 # else: 6135 # rm_container = "" 6136 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6137 6138 log.debug(docker_cmd) 6139 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6140 log.debug(res.stdout) 6141 if res.stderr: 6142 log.error(res.stderr) 6143 res.check_returncode() 6144 else: 6145 log.warning(f"Splice tool configuration not found: {config}") 6146 
6147 # Update variants 6148 log.info("Annotation - Updating...") 6149 # Test find output vcf 6150 log.debug( 6151 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6152 ) 6153 output_vcf = [] 6154 # Wrong folder to look in 6155 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6156 if ( 6157 files 6158 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6159 ): 6160 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6161 # log.debug(os.listdir(options.get("output_folder"))) 6162 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6163 if not output_vcf: 6164 log.debug( 6165 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6166 ) 6167 else: 6168 # Get new header from annotated vcf 6169 log.debug(f"Initial header: {len(header.infos)} fields") 6170 # Create new header with splice infos 6171 new_vcf = Variants(input=output_vcf[0]) 6172 new_vcf_header = new_vcf.get_header().infos 6173 for keys, infos in new_vcf_header.items(): 6174 if keys not in header.infos.keys(): 6175 header.infos[keys] = infos 6176 log.debug(f"New header: {len(header.infos)} fields") 6177 log.debug(f"Splice tmp output: {output_vcf[0]}") 6178 self.update_from_vcf(output_vcf[0]) 6179 6180 # Remove folder 6181 remove_if_exists(output_folder) 6182 6183 ### 6184 # Prioritization 6185 ### 6186 6187 def get_config_default(self, name: str) -> dict: 6188 """ 6189 The function `get_config_default` returns a dictionary containing default configurations for 6190 various calculations and prioritizations. 6191 6192 :param name: The `get_config_default` function returns a dictionary containing default 6193 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6194 specify which specific configuration to retrieve from the dictionary 6195 :type name: str 6196 :return: The function `get_config_default` returns a dictionary containing default configuration 6197 settings for different calculations and prioritizations. The specific configuration settings are 6198 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6199 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6200 returned. If there is no match, an empty dictionary is returned. 6201 """ 6202 6203 config_default = { 6204 "calculations": { 6205 "variant_chr_pos_alt_ref": { 6206 "type": "sql", 6207 "name": "variant_chr_pos_alt_ref", 6208 "description": "Create a variant ID with chromosome, position, alt and ref", 6209 "available": False, 6210 "output_column_name": "variant_chr_pos_alt_ref", 6211 "output_column_type": "String", 6212 "output_column_description": "variant ID with chromosome, position, alt and ref", 6213 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6214 "operation_info": True, 6215 }, 6216 "VARTYPE": { 6217 "type": "sql", 6218 "name": "VARTYPE", 6219 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6220 "available": True, 6221 "output_column_name": "VARTYPE", 6222 "output_column_type": "String", 6223 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6224 "operation_query": """ 6225 CASE 6226 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6227 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6228 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6229 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6230 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6231 ELSE 'UNDEFINED' 6232 END 6233 """, 6234 "info_fields": ["SVTYPE"], 6235 "operation_info": True, 6236 }, 6237 "snpeff_hgvs": { 6238 "type": "python", 6239 "name": "snpeff_hgvs", 6240 "description": "HGVS nomenclatures from snpEff annotation", 6241 "available": True, 6242 "function_name": "calculation_extract_snpeff_hgvs", 6243 "function_params": [], 6244 }, 6245 "NOMEN": { 6246 "type": "python", 6247 "name": "NOMEN", 6248 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6249 "available": True, 6250 "function_name": "calculation_extract_nomen", 6251 "function_params": [], 6252 }, 6253 "FINDBYPIPELINE": { 6254 "type": "python", 6255 "name": "FINDBYPIPELINE", 6256 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6257 "available": True, 6258 "function_name": "calculation_find_by_pipeline", 6259 "function_params": ["findbypipeline"], 6260 }, 6261 "FINDBYSAMPLE": { 6262 "type": "python", 6263 "name": "FINDBYSAMPLE", 6264 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6265 "available": True, 6266 "function_name": "calculation_find_by_pipeline", 6267 "function_params": ["findbysample"], 6268 }, 6269 "GENOTYPECONCORDANCE": { 6270 "type": "python", 6271 "name": "GENOTYPECONCORDANCE", 6272 "description": "Concordance of genotype for multi caller VCF", 6273 "available": True, 6274 "function_name": "calculation_genotype_concordance", 6275 "function_params": [], 6276 }, 6277 "BARCODE": { 6278 "type": "python", 6279 "name": "BARCODE", 6280 "description": "BARCODE as VaRank tool", 6281 "available": True, 6282 "function_name": "calculation_barcode", 6283 "function_params": [], 6284 }, 6285 "BARCODEFAMILY": { 6286 "type": "python", 6287 "name": "BARCODEFAMILY", 6288 "description": "BARCODEFAMILY as VaRank tool", 6289 "available": True, 6290 "function_name": "calculation_barcode_family", 6291 "function_params": ["BCF"], 6292 }, 6293 "TRIO": { 6294 "type": "python", 6295 "name": "TRIO", 6296 "description": "Inheritance for a trio family", 6297 "available": True, 6298 "function_name": "calculation_trio", 6299 "function_params": [], 6300 }, 6301 "VAF": { 6302 "type": "python", 6303 "name": "VAF", 6304 "description": "Variant Allele Frequency (VAF) harmonization", 6305 "available": True, 6306 "function_name": "calculation_vaf_normalization", 6307 "function_params": [], 6308 }, 6309 "VAF_stats": { 6310 "type": "python", 6311 "name": 
"VAF_stats", 6312 "description": "Variant Allele Frequency (VAF) statistics", 6313 "available": True, 6314 "function_name": "calculation_genotype_stats", 6315 "function_params": ["VAF"], 6316 }, 6317 "DP_stats": { 6318 "type": "python", 6319 "name": "DP_stats", 6320 "description": "Depth (DP) statistics", 6321 "available": True, 6322 "function_name": "calculation_genotype_stats", 6323 "function_params": ["DP"], 6324 }, 6325 "variant_id": { 6326 "type": "python", 6327 "name": "variant_id", 6328 "description": "Variant ID generated from variant position and type", 6329 "available": True, 6330 "function_name": "calculation_variant_id", 6331 "function_params": [], 6332 }, 6333 }, 6334 "prioritizations": { 6335 "default": { 6336 "filter": [ 6337 { 6338 "type": "notequals", 6339 "value": "!PASS|\\.", 6340 "score": 0, 6341 "flag": "FILTERED", 6342 "comment": ["Bad variant quality"], 6343 }, 6344 { 6345 "type": "equals", 6346 "value": "REJECT", 6347 "score": -20, 6348 "flag": "PASS", 6349 "comment": ["Bad variant quality"], 6350 }, 6351 ], 6352 "DP": [ 6353 { 6354 "type": "gte", 6355 "value": "50", 6356 "score": 5, 6357 "flag": "PASS", 6358 "comment": ["DP higher than 50"], 6359 } 6360 ], 6361 "ANN": [ 6362 { 6363 "type": "contains", 6364 "value": "HIGH", 6365 "score": 5, 6366 "flag": "PASS", 6367 "comment": [ 6368 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6369 ], 6370 }, 6371 { 6372 "type": "contains", 6373 "value": "MODERATE", 6374 "score": 3, 6375 "flag": "PASS", 6376 "comment": [ 6377 "A non-disruptive variant that might change protein effectiveness" 6378 ], 6379 }, 6380 { 6381 "type": "contains", 6382 "value": "LOW", 6383 "score": 0, 6384 "flag": "FILTERED", 6385 "comment": [ 6386 "Assumed to be mostly harmless or unlikely to change protein behavior" 6387 ], 6388 }, 6389 { 6390 "type": "contains", 6391 "value": "MODIFIER", 6392 "score": 0, 6393 
"flag": "FILTERED", 6394 "comment": [ 6395 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6396 ], 6397 }, 6398 ], 6399 } 6400 }, 6401 } 6402 6403 return config_default.get(name, None) 6404 6405 def get_config_json( 6406 self, name: str, config_dict: dict = {}, config_file: str = None 6407 ) -> dict: 6408 """ 6409 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6410 default values, a dictionary, and a file. 6411 6412 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6413 the name of the configuration. It is used to identify and retrieve the configuration settings 6414 for a specific component or module 6415 :type name: str 6416 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6417 dictionary that allows you to provide additional configuration settings or overrides. When you 6418 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6419 the key is the configuration setting you want to override or 6420 :type config_dict: dict 6421 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6422 specify the path to a configuration file that contains additional settings. If provided, the 6423 function will read the contents of this file and update the configuration dictionary with the 6424 values found in the file, overriding any existing values with the 6425 :type config_file: str 6426 :return: The function `get_config_json` returns a dictionary containing the configuration 6427 settings. 
        """

        # Create with default prioritizations
        # NOTE(review): get_config_default() returns None for an unknown section
        # name, which would make the assignments below raise TypeError — confirm
        # callers only pass known names
        config_default = self.get_config_default(name=name)
        configuration = config_default
        # log.debug(f"configuration={configuration}")

        # Replace prioritizations from dict
        # (top-level keys are replaced wholesale — no deep merge)
        for config in config_dict:
            configuration[config] = config_dict[config]

        # Replace prioritizations from file
        # assumes full_path(None) returns a falsy value so the block is skipped — TODO confirm
        config_file = full_path(config_file)
        if config_file:
            if os.path.exists(config_file):
                with open(config_file) as config_file_content:
                    config_file_dict = json.load(config_file_content)
                    for config in config_file_dict:
                        configuration[config] = config_file_dict[config]
            else:
                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
                log.error(msg_error)
                raise ValueError(msg_error)

        return configuration

    # NOTE(review): legacy implementation kept commented out; superseded by get_config_json
    # def get_prioritizations_config(self, prioritizations_config_dict:dict = {}, prioritizations_config_file:str = None) -> dict:

    #     # Create with default prioritizations
    #     prioritizations_config = self.get_config_default("prioritization")

    #     # Replace prioritizations from dict
    #     for prioritization_config in prioritizations_config_dict:
    #         prioritizations_config[prioritization_config] = prioritizations_config_dict[prioritization_config]

    #     # Replace prioritizations from file
    #     prioritizations_config_file = full_path(prioritizations_config_file)
    #     if prioritizations_config_file:
    #         if os.path.exists(prioritizations_config_file):
    #             with open(prioritizations_config_file) as prioritizations_config_file_content:
    #                 prioritizations_config_file_dict = json.load(prioritizations_config_file_content)
    #                 for prioritization_config in prioritizations_config_file_dict:
    #                     prioritizations_config[prioritization_config] = prioritizations_config_file_dict[prioritization_config]
    #         else:
    #             log.error(f"Prioritizations config file '{prioritizations_config_file}' does NOT
exist") 6473 # raise ValueError(f"Prioritizations config file '{prioritizations_config_file}' does NOT exist") 6474 6475 # return prioritizations_config 6476 6477 def prioritization(self) -> None: 6478 """ 6479 It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other 6480 INFO fields 6481 """ 6482 6483 # Config 6484 config = self.get_config() 6485 6486 # Param 6487 param = self.get_param() 6488 6489 # Quick Prioritizations 6490 # prioritizations = param.get("prioritization", {}).get("prioritizations", "") 6491 6492 # Configuration profiles 6493 prioritization_config_file = param.get("prioritization", {}).get( 6494 "prioritization_config", None 6495 ) 6496 prioritization_config_file = full_path(prioritization_config_file) 6497 prioritizations_config = self.get_config_json( 6498 name="prioritizations", config_file=prioritization_config_file 6499 ) 6500 6501 # Prioritization options 6502 profiles = param.get("prioritization", {}).get("profiles", []) 6503 if isinstance(profiles, str): 6504 profiles = profiles.split(",") 6505 pzfields = param.get("prioritization", {}).get( 6506 "pzfields", ["PZFlag", "PZScore"] 6507 ) 6508 if isinstance(pzfields, str): 6509 pzfields = pzfields.split(",") 6510 default_profile = param.get("prioritization", {}).get("default_profile", None) 6511 pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_") 6512 prioritization_score_mode = param.get("prioritization", {}).get( 6513 "prioritization_score_mode", "HOWARD" 6514 ) 6515 6516 # Quick Prioritizations 6517 # prioritizations = param.get("prioritization", {}).get("prioritizations", None) 6518 prioritizations = param.get("prioritizations", None) 6519 if prioritizations: 6520 log.info("Quick Prioritization:") 6521 for profile in prioritizations.split(","): 6522 if profile not in profiles: 6523 profiles.append(profile) 6524 log.info(f" {profile}") 6525 6526 # If profile "ALL" provided, all profiles in the config profiles 6527 if "ALL" in 
profiles: 6528 profiles = list(prioritizations_config.keys()) 6529 6530 for profile in profiles: 6531 if prioritizations_config.get(profile, None): 6532 log.debug(f"Profile '{profile}' configured") 6533 else: 6534 msg_error = f"Profile '{profile}' NOT configured" 6535 log.error(msg_error) 6536 raise ValueError(msg_error) 6537 6538 if profiles: 6539 log.info(f"Prioritization... ") 6540 else: 6541 log.debug(f"No profile defined") 6542 return 6543 6544 if not default_profile and len(profiles): 6545 default_profile = profiles[0] 6546 6547 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6548 log.debug("Profiles to check: " + str(list(profiles))) 6549 6550 # Variables 6551 table_variants = self.get_table_variants(clause="update") 6552 6553 # Added columns 6554 added_columns = [] 6555 6556 # Create list of PZfields 6557 # List of PZFields 6558 list_of_pzfields_original = pzfields + [ 6559 pzfield + pzfields_sep + profile 6560 for pzfield in pzfields 6561 for profile in profiles 6562 ] 6563 list_of_pzfields = [] 6564 log.debug(f"{list_of_pzfields_original}") 6565 6566 # Remove existing PZfields to use if exists 6567 for pzfield in list_of_pzfields_original: 6568 if self.get_header().infos.get(pzfield, None) is None: 6569 list_of_pzfields.append(pzfield) 6570 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6571 else: 6572 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6573 6574 if list_of_pzfields: 6575 6576 # Explode Infos fields 6577 explode_infos_prefix = self.get_explode_infos_prefix() 6578 added_columns += self.explode_infos(prefix=explode_infos_prefix) 6579 extra_infos = self.get_extra_infos() 6580 6581 # PZfields tags description 6582 PZfields_INFOS = { 6583 "PZTags": { 6584 "ID": "PZTags", 6585 "Number": ".", 6586 "Type": "String", 6587 "Description": "Variant tags based on annotation criteria", 6588 }, 6589 "PZScore": { 6590 "ID": "PZScore", 6591 "Number": 1, 6592 "Type": "Integer", 6593 
"Description": "Variant score based on annotation criteria", 6594 }, 6595 "PZFlag": { 6596 "ID": "PZFlag", 6597 "Number": 1, 6598 "Type": "String", 6599 "Description": "Variant flag based on annotation criteria", 6600 }, 6601 "PZComment": { 6602 "ID": "PZComment", 6603 "Number": ".", 6604 "Type": "String", 6605 "Description": "Variant comment based on annotation criteria", 6606 }, 6607 "PZInfos": { 6608 "ID": "PZInfos", 6609 "Number": ".", 6610 "Type": "String", 6611 "Description": "Variant infos based on annotation criteria", 6612 }, 6613 } 6614 6615 # Create INFO fields if not exist 6616 for field in PZfields_INFOS: 6617 field_ID = PZfields_INFOS[field]["ID"] 6618 field_description = PZfields_INFOS[field]["Description"] 6619 if field_ID not in self.get_header().infos and field_ID in pzfields: 6620 field_description = ( 6621 PZfields_INFOS[field]["Description"] 6622 + f", profile {default_profile}" 6623 ) 6624 self.get_header().infos[field_ID] = vcf.parser._Info( 6625 field_ID, 6626 PZfields_INFOS[field]["Number"], 6627 PZfields_INFOS[field]["Type"], 6628 field_description, 6629 "unknown", 6630 "unknown", 6631 code_type_map[PZfields_INFOS[field]["Type"]], 6632 ) 6633 6634 # Create INFO fields if not exist for each profile 6635 for profile in prioritizations_config: 6636 if profile in profiles or profiles == []: 6637 for field in PZfields_INFOS: 6638 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6639 field_description = ( 6640 PZfields_INFOS[field]["Description"] 6641 + f", profile {profile}" 6642 ) 6643 if ( 6644 field_ID not in self.get_header().infos 6645 and field in pzfields 6646 ): 6647 self.get_header().infos[field_ID] = vcf.parser._Info( 6648 field_ID, 6649 PZfields_INFOS[field]["Number"], 6650 PZfields_INFOS[field]["Type"], 6651 field_description, 6652 "unknown", 6653 "unknown", 6654 code_type_map[PZfields_INFOS[field]["Type"]], 6655 ) 6656 6657 # Header 6658 for pzfield in list_of_pzfields: 6659 if re.match("PZScore.*", pzfield): 6660 
added_column = self.add_column( 6661 table_name=table_variants, 6662 column_name=pzfield, 6663 column_type="INTEGER", 6664 default_value="0", 6665 ) 6666 elif re.match("PZFlag.*", pzfield): 6667 added_column = self.add_column( 6668 table_name=table_variants, 6669 column_name=pzfield, 6670 column_type="BOOLEAN", 6671 default_value="1", 6672 ) 6673 else: 6674 added_column = self.add_column( 6675 table_name=table_variants, 6676 column_name=pzfield, 6677 column_type="STRING", 6678 default_value="''", 6679 ) 6680 added_columns.append(added_column) 6681 6682 # Profiles 6683 if profiles: 6684 6685 # foreach profile in configuration file 6686 for profile in prioritizations_config: 6687 6688 # If profile is asked in param, or ALL are asked (empty profile []) 6689 if profile in profiles or profiles == []: 6690 log.info(f"Profile '{profile}'") 6691 6692 sql_set_info_option = "" 6693 6694 sql_set_info = [] 6695 6696 # PZ fields set 6697 6698 # PZScore 6699 if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields: 6700 sql_set_info.append( 6701 f""" 6702 concat( 6703 'PZScore{pzfields_sep}{profile}=', 6704 PZScore{pzfields_sep}{profile} 6705 ) 6706 """ 6707 ) 6708 if ( 6709 profile == default_profile 6710 and "PZScore" in list_of_pzfields 6711 ): 6712 sql_set_info.append( 6713 f""" 6714 concat( 6715 'PZScore=', 6716 PZScore{pzfields_sep}{profile} 6717 ) 6718 """ 6719 ) 6720 6721 # PZFlag 6722 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6723 sql_set_info.append( 6724 f""" 6725 concat( 6726 'PZFlag{pzfields_sep}{profile}=', 6727 CASE 6728 WHEN PZFlag{pzfields_sep}{profile}==1 6729 THEN 'PASS' 6730 WHEN PZFlag{pzfields_sep}{profile}==0 6731 THEN 'FILTERED' 6732 END 6733 ) 6734 """ 6735 ) 6736 if ( 6737 profile == default_profile 6738 and "PZFlag" in list_of_pzfields 6739 ): 6740 sql_set_info.append( 6741 f""" 6742 concat( 6743 'PZFlag=', 6744 CASE 6745 WHEN PZFlag{pzfields_sep}{profile}==1 6746 THEN 'PASS' 6747 WHEN PZFlag{pzfields_sep}{profile}==0 6748 THEN 
'FILTERED' 6749 END 6750 ) 6751 """ 6752 ) 6753 6754 # PZComment 6755 if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields: 6756 sql_set_info.append( 6757 f""" 6758 CASE 6759 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6760 THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile}) 6761 ELSE '' 6762 END 6763 """ 6764 ) 6765 if ( 6766 profile == default_profile 6767 and "PZComment" in list_of_pzfields 6768 ): 6769 sql_set_info.append( 6770 f""" 6771 CASE 6772 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6773 THEN concat('PZComment=', PZComment{pzfields_sep}{profile}) 6774 ELSE '' 6775 END 6776 """ 6777 ) 6778 6779 # PZInfos 6780 if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields: 6781 sql_set_info.append( 6782 f""" 6783 CASE 6784 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6785 THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile}) 6786 ELSE '' 6787 END 6788 """ 6789 ) 6790 if ( 6791 profile == default_profile 6792 and "PZInfos" in list_of_pzfields 6793 ): 6794 sql_set_info.append( 6795 f""" 6796 CASE 6797 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6798 THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile}) 6799 ELSE '' 6800 END 6801 """ 6802 ) 6803 6804 # Merge PZfields 6805 sql_set_info_option = "" 6806 sql_set_sep = "" 6807 for sql_set in sql_set_info: 6808 if sql_set_sep: 6809 sql_set_info_option += f""" 6810 , concat('{sql_set_sep}', {sql_set}) 6811 """ 6812 else: 6813 sql_set_info_option += f""" 6814 , {sql_set} 6815 """ 6816 sql_set_sep = ";" 6817 6818 sql_queries = [] 6819 for annotation in prioritizations_config[profile]: 6820 6821 # Check if annotation field is present 6822 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 6823 log.debug(f"Annotation '{annotation}' not in data") 6824 continue 6825 else: 6826 log.debug(f"Annotation '{annotation}' in data") 6827 6828 # For each criterions 6829 for criterion in prioritizations_config[profile][ 6830 annotation 6831 ]: 
6832 criterion_type = criterion["type"] 6833 criterion_value = criterion["value"] 6834 criterion_score = criterion.get("score", 0) 6835 criterion_flag = criterion.get("flag", "PASS") 6836 criterion_flag_bool = criterion_flag == "PASS" 6837 criterion_comment = ( 6838 ", ".join(criterion.get("comment", [])) 6839 .replace("'", "''") 6840 .replace(";", ",") 6841 .replace("\t", " ") 6842 ) 6843 criterion_infos = ( 6844 str(criterion) 6845 .replace("'", "''") 6846 .replace(";", ",") 6847 .replace("\t", " ") 6848 ) 6849 6850 sql_set = [] 6851 sql_set_info = [] 6852 6853 # PZ fields set 6854 if ( 6855 f"PZScore{pzfields_sep}{profile}" 6856 in list_of_pzfields 6857 ): 6858 if prioritization_score_mode == "HOWARD": 6859 sql_set.append( 6860 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6861 ) 6862 elif prioritization_score_mode == "VaRank": 6863 sql_set.append( 6864 f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END" 6865 ) 6866 else: 6867 sql_set.append( 6868 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6869 ) 6870 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6871 sql_set.append( 6872 f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}" 6873 ) 6874 if ( 6875 f"PZComment{pzfields_sep}{profile}" 6876 in list_of_pzfields 6877 ): 6878 sql_set.append( 6879 f""" 6880 PZComment{pzfields_sep}{profile} = 6881 concat( 6882 PZComment{pzfields_sep}{profile}, 6883 CASE 6884 WHEN PZComment{pzfields_sep}{profile}!='' 6885 THEN ', ' 6886 ELSE '' 6887 END, 6888 '{criterion_comment}' 6889 ) 6890 """ 6891 ) 6892 if ( 6893 f"PZInfos{pzfields_sep}{profile}" 6894 in list_of_pzfields 6895 ): 6896 sql_set.append( 6897 f""" 6898 PZInfos{pzfields_sep}{profile} = 6899 concat( 6900 PZInfos{pzfields_sep}{profile}, 6901 '{criterion_infos}' 6902 ) 6903 """ 6904 ) 6905 sql_set_option = ",".join(sql_set) 
6906 6907 # Criterion and comparison 6908 try: 6909 float(criterion_value) 6910 sql_update = f""" 6911 UPDATE {table_variants} 6912 SET {sql_set_option} 6913 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 6914 AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value} 6915 """ 6916 except: 6917 contains_option = "" 6918 if criterion_type == "contains": 6919 contains_option = ".*" 6920 sql_update = f""" 6921 UPDATE {table_variants} 6922 SET {sql_set_option} 6923 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 6924 """ 6925 sql_queries.append(sql_update) 6926 6927 # PZTags 6928 if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields: 6929 6930 # Create PZFalgs value 6931 pztags_value = "" 6932 pztags_sep_default = "|" 6933 pztags_sep = "" 6934 for pzfield in pzfields: 6935 if pzfield not in ["PZTags"]: 6936 if ( 6937 f"{pzfield}{pzfields_sep}{profile}" 6938 in list_of_pzfields 6939 ): 6940 if pzfield in ["PZFlag"]: 6941 pztags_value += f"""{pztags_sep}{pzfield}#', 6942 CASE WHEN PZFlag{pzfields_sep}{profile} 6943 THEN 'PASS' 6944 ELSE 'FILTERED' 6945 END, '""" 6946 else: 6947 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 6948 pztags_sep = pztags_sep_default 6949 6950 # Add Query update for PZFlags 6951 sql_update_pztags = f""" 6952 UPDATE {table_variants} 6953 SET INFO = concat( 6954 INFO, 6955 CASE WHEN INFO NOT in ('','.') 6956 THEN ';' 6957 ELSE '' 6958 END, 6959 'PZTags{pzfields_sep}{profile}={pztags_value}' 6960 ) 6961 """ 6962 sql_queries.append(sql_update_pztags) 6963 6964 # Add Query update for PZFlags for default 6965 if profile == default_profile: 6966 sql_update_pztags_default = f""" 6967 UPDATE {table_variants} 6968 SET INFO = concat( 6969 INFO, 6970 ';', 6971 'PZTags={pztags_value}' 6972 ) 6973 """ 6974 sql_queries.append(sql_update_pztags_default) 6975 6976 log.info(f"""Profile '{profile}' - 
                    Prioritization... """)

                    if sql_queries:

                        # Apply each accumulated per-criterion UPDATE
                        for sql_query in sql_queries:
                            log.debug(
                                f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                            )
                            self.conn.execute(sql_query)

                    # Fold the computed PZ columns back into the INFO field
                    log.info(f"""Profile '{profile}' - Update... """)
                    sql_query_update = f"""
                        UPDATE {table_variants}
                        SET INFO =
                            concat(
                                CASE
                                    WHEN INFO NOT IN ('','.')
                                    THEN concat(INFO, ';')
                                    ELSE ''
                                END
                                {sql_set_info_option}
                            )
                    """
                    self.conn.execute(sql_query_update)

        else:

            log.warning(f"No profiles in parameters")

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return

    ###
    # HGVS
    ###

    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline: select SNV/InDel variants, join them against a refSeq transcript table in
        DuckDB, then compute HGVS names per variant in parallel via a Dask dataframe, and
        finally write the result into a temporary column and the INFO field.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Relies on closure variables from the enclosing method: `polars_conn`, `transcripts`,
            `genome`, and the HGVS formatting options (`use_exon`, `use_protein`, ...).

            :param row: A dictionary-like object that contains the values for the following keys:
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): the SQL references refseq_df — visibility depends on the
            # SQLContext(register_globals=True) connection below; confirm the frame is registered.
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name alongside the DNA-level one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection
        # NOTE(review): 'pl' (polars) and 'dd' (dask.dataframe, used below) do not appear in the
        # visible import header — presumably pulled in via the star imports; verify.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Same config path as above but defaulting to "" (used as a direct genome file hint)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "key=value,key2,..." options into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # A bare option name means "enabled"
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            # No HGVS section in param: nothing to do
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit file first, fall back to folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF/ALT only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: transcripts overlapping each variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection (re-created so the frames above are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): "annotatation" typo below is in a runtime header string — left as-is.
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a sorted, human-readable list of the available calculation operations.

        :param operations_config_dict: optional in-memory calculations configuration
        :param operations_config_file: optional path to a calculations configuration file
        :return: list of help lines, headed by "Available calculation operations:"
        """

        # Init
        operations_help = []

        # operations
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        # Keep only operations flagged as available; show each as "NAME: description"
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f"   {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function.

        Operations come (in priority order) from the "calculation.calculations" param section,
        then the `operations` argument; the quick "calculations" param (comma-separated names)
        adds further entries. Raises ValueError for unknown operations or operation types.

        NOTE(review): the mutable default `operations={}` is shared across calls — assumed
        callers never mutate it; confirm.

        param json example:
        "calculation": {
          "NOMEN": {
            "options": {
              "hgvs_field": "hgvs"
            },
            "middle" : null
          }
        }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add comma-separated operation names from param["calculations"]
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    # Mirror the quick operation into the param tree
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        # Dispatch on operation type (default "sql")
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        mathematical operation to be performed. It includes the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed. It is used for logging and error handling purposes,
        defaults to unknown
        :type operation_name: str (optional)
        """

        # table variants
        table_variants = self.get_table_variants(clause="alter")

        # Operation infos
        # NOTE: the argument value of operation_name is immediately overridden by operation["name"]
        operation_name = operation.get("name", "unknown")
        log.debug(f"process sql {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        # A query given as a list of lines is joined into a single SQL string
        operation_query = operation.get("operation_query", None)
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)

        if operation_query:

            # Info fields check: all required INFO fields must exist in the VCF header
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added_columns
                added_columns = []

                # Create VCF header field
                vcf_reader = self.get_header()
                # NOTE(review): uses self.code_type_map here while other methods in this file use
                # the module-level code_type_map mapping — confirm both exist and agree.
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=True,
                )

                # Create column to hold the computed value
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                    """
                    self.conn.execute(sql_update)

                    # Add to INFO (skipping NULL/empty results)
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" =
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                        """
                        self.conn.execute(sql_update_info)

                # NOTE(review): bare except swallows the original DB error and re-raises a
                # generic ValueError — consider `except Exception as e: ... raise ... from e`.
                except:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )

    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        operation to be performed. It has the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the operation being performed. It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        :raises KeyError: if "name", "function_name" or "function_params" is missing from operation
        :raises AttributeError: if the named method does not exist on this instance
        """

        # NOTE: operation_name argument is overridden; KeyError if "name" is absent
        operation_name = operation["name"]
        # NOTE(review): log text says "process sql" although this handles python-type operations
        log.debug(f"process sql {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        # Dynamic dispatch to a method of this class
        getattr(self, function_name)(*function_params)

    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
7709 """ 7710 7711 # variant_id annotation field 7712 variant_id_tag = self.get_variant_id_column() 7713 added_columns = [variant_id_tag] 7714 7715 # variant_id hgvs tags" 7716 vcf_infos_tags = { 7717 variant_id_tag: "howard variant ID annotation", 7718 } 7719 7720 # Variants table 7721 table_variants = self.get_table_variants() 7722 7723 # Header 7724 vcf_reader = self.get_header() 7725 7726 # Add variant_id to header 7727 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7728 variant_id_tag, 7729 ".", 7730 "String", 7731 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7732 "howard calculation", 7733 "0", 7734 self.code_type_map.get("String"), 7735 ) 7736 7737 # Update 7738 sql_update = f""" 7739 UPDATE {table_variants} 7740 SET "INFO" = 7741 concat( 7742 CASE 7743 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7744 THEN '' 7745 ELSE concat("INFO", ';') 7746 END, 7747 '{variant_id_tag}=', 7748 "{variant_id_tag}" 7749 ) 7750 """ 7751 self.conn.execute(sql_update) 7752 7753 # Remove added columns 7754 for added_column in added_columns: 7755 self.drop_column(column=added_column) 7756 7757 def calculation_extract_snpeff_hgvs(self) -> None: 7758 """ 7759 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7760 annotation field in a VCF file and adds them as a new column in the variants table. 
7761 """ 7762 7763 # SnpEff annotation field 7764 snpeff_ann = "ANN" 7765 7766 # SnpEff annotation field 7767 snpeff_hgvs = "snpeff_hgvs" 7768 7769 # Snpeff hgvs tags 7770 vcf_infos_tags = { 7771 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7772 } 7773 7774 # Prefix 7775 prefix = self.get_explode_infos_prefix() 7776 if prefix: 7777 prefix = "INFO/" 7778 7779 # snpEff fields 7780 speff_ann_infos = prefix + snpeff_ann 7781 speff_hgvs_infos = prefix + snpeff_hgvs 7782 7783 # Variants table 7784 table_variants = self.get_table_variants() 7785 7786 # Header 7787 vcf_reader = self.get_header() 7788 7789 # Add columns 7790 added_columns = [] 7791 7792 # Explode HGVS field in column 7793 added_columns += self.explode_infos(fields=[snpeff_ann]) 7794 7795 if "ANN" in vcf_reader.infos: 7796 7797 log.debug(vcf_reader.infos["ANN"]) 7798 7799 # Create variant id 7800 variant_id_column = self.get_variant_id_column() 7801 added_columns += [variant_id_column] 7802 7803 # Create dataframe 7804 dataframe_snpeff_hgvs = self.get_query_to_df( 7805 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 7806 ) 7807 7808 # Create main NOMEN column 7809 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 7810 speff_ann_infos 7811 ].apply(lambda x: extract_snpeff_hgvs(str(x))) 7812 7813 # Add snpeff_hgvs to header 7814 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 7815 snpeff_hgvs, 7816 ".", 7817 "String", 7818 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 7819 "howard calculation", 7820 "0", 7821 self.code_type_map.get("String"), 7822 ) 7823 7824 # Update 7825 sql_update = f""" 7826 UPDATE variants 7827 SET "INFO" = 7828 concat( 7829 CASE 7830 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7831 THEN '' 7832 ELSE concat("INFO", ';') 7833 END, 7834 CASE 7835 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 7836 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 7837 THEN concat( 7838 '{snpeff_hgvs}=', 
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        For each variant, the exploded HGVS field is parsed with `find_nomen` into a
        dict of NOMEN sub-fields, each sub-field is declared in the VCF header and
        appended to the INFO column via a single SQL UPDATE joined on
        #CHROM/POS/REF/ALT. Any columns added for the computation are dropped
        afterwards.
        """

        # Name of the temporary dataframe column holding the parsed NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field (name of the INFO field containing HGVS nomenclatures)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts of preference (first column of the transcripts file)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE(review): unlike sibling calculations, this UPDATE does not
            # treat an INFO of '' or '.' as empty, and each field carries a
            # leading ';' — an empty/placeholder INFO can therefore end up as
            # '.;NOMEN=...' or start with ';'. Confirm whether this is intended.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file.
        It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (one value per variant row)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, with ';' only when INFO is non-empty
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT
 {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (one value per variant row)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks copy/pasted from the snpEff calculation; it is unreachable
            # here (the tag is always in vcf_infos_tags) but misleading.
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, with ';' only when INFO is non-empty
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: INFO tag name for the barcode annotation, defaults to "barcode"
        :type tag: str (optional)
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if tag is empty/None)
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (one value per variant row)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): the default `vcf_infos_tags.get(tag)` is redundant —
            # it is the same lookup as the first argument.
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, with ';' only when INFO is non-empty
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if tag is empty/None)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree may be a file path, a JSON string,
            # a comma-separated sample list, or a dict
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                # NOTE(review): `ped` is rebound to the file handle by the
                # `with ... as ped` clause, then to the parsed JSON — works,
                # but the shadowing is easy to misread.
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except
 ValueError as e:
                        # Not JSON: treat as comma-separated sample names
                        # NOTE(review): `e` is unused.
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                # No pedigree given: use every sample from the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (one value per variant row, family samples only)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family (and its sample list) to header as FORMAT fields
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<barcode>:<samples>' to every sample genotype and
            # ':<tag>:<tag>S' to FORMAT; samples outside the family get '.' values.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, pad with one '.' per FORMAT sub-field
                # (derived by stripping alphanumerics from FORMAT, leaving ':')
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
        The trio (father/mother/child) comes from the 'TRIO' calculation
        parameters ('trio_pedigree' as file, JSON string, comma-separated
        list, or dict) or defaults to the first three samples of the header.
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect 'father,mother,child'
                        # NOTE(review): `e` is unused.
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            # NOTE(review): the f-prefix on the first literal is unnecessary
            # (no placeholders).
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (one value per variant row)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks copy/pasted from the snpEff calculation; it is unreachable
            # here but misleading.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, with ';' only when INFO is non-empty
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                            AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_trio
            gc.collect()

    def calculation_vaf_normalization(self) -> None:
        """
        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
        :return: The function does not return anything.
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_normalization annotation field
            vaf_normalization_tag = "VAF"

            # VCF infos tags
            vcf_infos_tags = {
                "VAF": "VAF Variant Frequency",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Do not calculate if VAF already exists
            if "VAF" in vcf_reader.formats:
                log.debug("VAF already on genotypes")
                return

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE(review): samples_fields already starts with variant_id and
            # FORMAT, so this SELECT lists both columns twice — confirm the
            # duplicated columns are intended (or harmless) downstream.
            dataframe_vaf_normalization = self.get_query_to_df(
                f""" SELECT {variant_id_column}, FORMAT,
 {samples_fields} FROM {table_variants} """
            )

            vaf_normalization_set = []

            # For each sample, rewrite its genotype with the normalized VAF appended
            for sample in self.get_header_sample_list():
                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                    lambda row: vaf_normalization(row, sample=sample), axis=1
                )
                vaf_normalization_set.append(
                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
                )

            # Add VAF to FORMAT
            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
                "FORMAT"
            ].apply(lambda x: str(x) + ":VAF")
            vaf_normalization_set.append(
                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
            )

            # Add vaf_normalization to header
            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
                id=vaf_normalization_tag,
                num="1",
                type="Float",
                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
                type_code=self.code_type_map.get("Float"),
            )

            # Create SET clause for the sample/FORMAT columns
            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_vaf_normalization_set}
                FROM dataframe_vaf_normalization
                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_normalization
            gc.collect()

    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only meaningful for VCFs with genotypes: requires FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (unique key used for the UPDATE join)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (dict of all statistics per variant row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract each statistic into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: ';' before every field except the first
                # NOTE(review): the separator is baked into each field at build
                # time, so if an earlier field evaluates to '' at runtime the
                # next one still carries its ';' — can yield ';;' or a leading
                # ';' in INFO. Confirm whether this matters downstream.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data()
The function __init__ initializes the instance variables and sets the input, output, config, param, connection and
header.
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
81 def set_input(self, input: str = None) -> None: 82 """ 83 The function takes a file name as input, splits the file name into a name and an extension, and 84 then sets the input_name, input_extension, and input_format attributes of the class 85 86 :param input: The input file 87 """ 88 89 if input and not isinstance(input, str): 90 try: 91 self.input = input.name 92 except: 93 log.error(f"Input file '{input} in bad format") 94 raise ValueError(f"Input file '{input} in bad format") 95 else: 96 self.input = input 97 98 # Input format 99 if input: 100 input_name, input_extension = os.path.splitext(self.input) 101 self.input_name = input_name 102 self.input_extension = input_extension 103 self.input_format = self.input_extension.replace(".", "")
The function takes a file name as input, splits the file name into a name and an extension, and then sets the input_name, input_extension, and input_format attributes of the class
Parameters
- input: The input file
105 def set_config(self, config: dict) -> None: 106 """ 107 This function takes in a config object and sets it as the config object for the class 108 109 :param config: The configuration object 110 """ 111 self.config = config
This function takes in a config object and sets it as the config object for the class
Parameters
- config: The configuration object
113 def set_param(self, param: dict) -> None: 114 """ 115 This function takes in a param object and sets it as the param object for the class 116 117 :param param: The paramters object 118 """ 119 self.param = param
This function takes in a param object and sets it as the param object for the class
Parameters
- param: The paramters object
121 def init_variables(self) -> None: 122 """ 123 This function initializes the variables that will be used in the rest of the class 124 """ 125 self.prefix = "howard" 126 self.table_variants = "variants" 127 self.dataframe = None 128 129 self.comparison_map = { 130 "gt": ">", 131 "gte": ">=", 132 "lt": "<", 133 "lte": "<=", 134 "equals": "=", 135 "contains": "SIMILAR TO", 136 } 137 138 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 139 140 self.code_type_map_to_sql = { 141 "Integer": "INTEGER", 142 "String": "VARCHAR", 143 "Float": "FLOAT", 144 "Flag": "VARCHAR", 145 } 146 147 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
149 def get_indexing(self) -> bool: 150 """ 151 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 152 returns False. 153 :return: The value of the indexing parameter. 154 """ 155 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
157 def get_connexion_config(self) -> dict: 158 """ 159 The function `get_connexion_config` returns a dictionary containing the configuration for a 160 connection, including the number of threads and memory limit. 161 :return: a dictionary containing the configuration for the Connexion library. 162 """ 163 164 # config 165 config = self.get_config() 166 167 # Connexion config 168 connexion_config = {} 169 threads = self.get_threads() 170 171 # Threads 172 if threads: 173 connexion_config["threads"] = threads 174 175 # Memory 176 # if config.get("memory", None): 177 # connexion_config["memory_limit"] = config.get("memory") 178 if self.get_memory(): 179 connexion_config["memory_limit"] = self.get_memory() 180 181 # Temporary directory 182 if config.get("tmp", None): 183 connexion_config["temp_directory"] = config.get("tmp") 184 185 # Access 186 if config.get("access", None): 187 access = config.get("access") 188 if access in ["RO"]: 189 access = "READ_ONLY" 190 elif access in ["RW"]: 191 access = "READ_WRITE" 192 connexion_db = self.get_connexion_db() 193 if connexion_db in ":memory:": 194 access = "READ_WRITE" 195 connexion_config["access_mode"] = access 196 197 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
199 def get_duckdb_settings(self) -> dict: 200 """ 201 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 202 string. 203 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 204 """ 205 206 # config 207 config = self.get_config() 208 209 # duckdb settings 210 duckdb_settings_dict = {} 211 if config.get("duckdb_settings", None): 212 duckdb_settings = config.get("duckdb_settings") 213 duckdb_settings = full_path(duckdb_settings) 214 # duckdb setting is a file 215 if os.path.exists(duckdb_settings): 216 with open(duckdb_settings) as json_file: 217 duckdb_settings_dict = yaml.safe_load(json_file) 218 # duckdb settings is a string 219 else: 220 duckdb_settings_dict = json.loads(duckdb_settings) 221 222 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
224 def set_connexion_db(self) -> str: 225 """ 226 The function `set_connexion_db` returns the appropriate database connection string based on the 227 input format and connection type. 228 :return: the value of the variable `connexion_db`. 229 """ 230 231 # Default connexion db 232 default_connexion_db = ":memory:" 233 234 # Find connexion db 235 if self.get_input_format() in ["db", "duckdb"]: 236 connexion_db = self.get_input() 237 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 238 connexion_db = default_connexion_db 239 elif self.get_connexion_type() in ["tmpfile"]: 240 tmp_name = tempfile.mkdtemp( 241 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 242 ) 243 connexion_db = f"{tmp_name}/tmp.db" 244 elif self.get_connexion_type() != "": 245 connexion_db = self.get_connexion_type() 246 else: 247 connexion_db = default_connexion_db 248 249 # Set connexion db 250 self.connexion_db = connexion_db 251 252 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connexion.

        When ``conn`` is falsy, a new DuckDB (default) or SQLite connexion
        is created against the connexion database string, applying the
        connexion configuration and any DuckDB settings as PRAGMA
        statements.

        :param conn: An existing database connexion to reuse. If not
            provided, a new connexion is created
        """

        # Resolve and store the connexion database string (e.g. ":memory:")
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory limit, access mode, tmp dir)
        connexion_config = self.get_connexion_config()

        # Connexion format: "duckdb" (default) or "sqlite"
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Create a connexion only when none was provided
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            # String values must be quoted in the PRAGMA
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                # NOTE(review): connexion_config is not applied to SQLite
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
It creates a connection to the database
Parameters
- conn: The connection to the database. If not provided, a new connection to an in-memory database is created
297 def set_output(self, output: str = None) -> None: 298 """ 299 If the config file has an output key, set the output to the value of that key. Otherwise, set 300 the output to the input 301 302 :param output: The name of the output file 303 """ 304 305 if output and not isinstance(output, str): 306 self.output = output.name 307 else: 308 self.output = output 309 310 # Output format 311 if self.output: 312 output_name, output_extension = os.path.splitext(self.output) 313 self.output_name = output_name 314 self.output_extension = output_extension 315 self.output_format = self.output_extension.replace(".", "") 316 else: 317 self.output_name = None 318 self.output_extension = None 319 self.output_format = None
If the config file has an output key, set the output to the value of that key. Otherwise, set the output to the input
Parameters
- output: The name of the output file
    def set_header(self) -> None:
        """
        Read and store the VCF header of the input file, both as a list of
        strings (``self.header_list``) and as a ``vcf.Reader`` object
        (``self.header_vcf``).

        The header is looked up in order: an explicit ``header_file`` from
        the configuration, the input file itself (for vcf/hdr formats,
        compressed or not), a sibling ``<input>.hdr`` file, and finally the
        file's columns via a ``Database`` object. When everything fails, a
        minimal default VCF header is used. When there is no input file,
        both attributes are set to None.

        :raises ValueError: when the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal fallback header
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except — any failure (including
                    # programming errors) silently falls back to the
                    # default header
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query on the current connexion and return the result
        as a pandas DataFrame.

        :param query: the SQL query to execute
        :type query: str
        :param limit: when provided, fetch only (up to) the first ``limit``
            rows; also raises pandas' "display.max_rows" option to ``limit``
        :type limit: int
        :return: A dataframe
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            # NOTE(review): changes a global pandas display option as a
            # side effect — confirm this is intended
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch a single record batch of (at most) `limit` rows
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # First chunk of `limit` rows
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df
The function `get_query_to_df` takes a query as a string and returns a pandas dataframe.
Parameters
- query: str = ""
Returns
A dataframe
457 def get_overview(self) -> None: 458 """ 459 The function prints the input, output, config, and dataframe of the current object 460 """ 461 table_variants_from = self.get_table_variants(clause="from") 462 sql_columns = self.get_header_columns_as_sql() 463 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 464 df = self.get_query_to_df(sql_query_export) 465 log.info( 466 "Input: " 467 + str(self.get_input()) 468 + " [" 469 + str(str(self.get_input_format())) 470 + "]" 471 ) 472 log.info( 473 "Output: " 474 + str(self.get_output()) 475 + " [" 476 + str(str(self.get_output_format())) 477 + "]" 478 ) 479 log.info("Config: ") 480 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 481 "\n" 482 ): 483 log.info("\t" + str(d)) 484 log.info("Param: ") 485 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 486 "\n" 487 ): 488 log.info("\t" + str(d)) 489 log.info("Sample list: " + str(self.get_header_sample_list())) 490 log.info("Dataframe: ") 491 for d in str(df).split("\n"): 492 log.info("\t" + str(d)) 493 494 # garbage collector 495 del df 496 gc.collect() 497 498 return None
The function prints the input, output, config, and dataframe of the current object
500 def get_stats(self) -> dict: 501 """ 502 The `get_stats` function calculates and returns various statistics of the current object, 503 including information about the input file, variants, samples, header fields, quality, and 504 SNVs/InDels. 505 :return: a dictionary containing various statistics of the current object. The dictionary has 506 the following structure: 507 """ 508 509 # Log 510 log.info(f"Stats Calculation...") 511 512 # table varaints 513 table_variants_from = self.get_table_variants() 514 515 # stats dict 516 stats = {"Infos": {}} 517 518 ### File 519 input_file = self.get_input() 520 stats["Infos"]["Input file"] = input_file 521 522 # Header 523 header_infos = self.get_header().infos 524 header_formats = self.get_header().formats 525 header_infos_list = list(header_infos) 526 header_formats_list = list(header_formats) 527 528 ### Variants 529 530 stats["Variants"] = {} 531 532 # Variants by chr 533 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 534 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 535 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 536 by=["CHROM"], kind="quicksort" 537 ) 538 539 # Total number of variants 540 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 541 542 # Calculate percentage 543 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 544 lambda x: (x / nb_of_variants) 545 ) 546 547 stats["Variants"]["Number of variants by chromosome"] = ( 548 nb_of_variants_by_chrom.to_dict(orient="index") 549 ) 550 551 stats["Infos"]["Number of variants"] = int(nb_of_variants) 552 553 ### Samples 554 555 # Init 556 samples = {} 557 nb_of_samples = 0 558 559 # Check Samples 560 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 561 log.debug(f"Check samples...") 562 for sample in self.get_header_sample_list(): 563 sql_query_samples = f""" 564 SELECT 
'{sample}' as sample, 565 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 566 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 567 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 568 FROM {table_variants_from} 569 WHERE ( 570 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 571 AND 572 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 573 ) 574 GROUP BY genotype 575 """ 576 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 577 sample_genotype_count = sql_query_genotype_df["count"].sum() 578 if len(sql_query_genotype_df): 579 nb_of_samples += 1 580 samples[f"{sample} - {sample_genotype_count} variants"] = ( 581 sql_query_genotype_df.to_dict(orient="index") 582 ) 583 584 stats["Samples"] = samples 585 stats["Infos"]["Number of samples"] = nb_of_samples 586 587 # # 588 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 589 # stats["Infos"]["Number of samples"] = nb_of_samples 590 # elif nb_of_samples: 591 # stats["Infos"]["Number of samples"] = "not a VCF format" 592 593 ### INFO and FORMAT fields 594 header_types_df = {} 595 header_types_list = { 596 "List of INFO fields": header_infos, 597 "List of FORMAT fields": header_formats, 598 } 599 i = 0 600 for header_type in header_types_list: 601 602 header_type_infos = header_types_list.get(header_type) 603 header_infos_dict = {} 604 605 for info in header_type_infos: 606 607 i += 1 608 header_infos_dict[i] = {} 609 610 # ID 611 header_infos_dict[i]["id"] = info 612 613 # num 614 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 615 if header_type_infos[info].num in genotype_map.keys(): 616 header_infos_dict[i]["Number"] = genotype_map.get( 617 header_type_infos[info].num 618 ) 619 else: 620 header_infos_dict[i]["Number"] = header_type_infos[info].num 621 622 # type 623 if header_type_infos[info].type: 624 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 625 else: 626 header_infos_dict[i]["Type"] = "." 627 628 # desc 629 if header_type_infos[info].desc != None: 630 header_infos_dict[i]["Description"] = header_type_infos[info].desc 631 else: 632 header_infos_dict[i]["Description"] = "" 633 634 if len(header_infos_dict): 635 header_types_df[header_type] = pd.DataFrame.from_dict( 636 header_infos_dict, orient="index" 637 ).to_dict(orient="index") 638 639 # Stats 640 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 641 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 642 stats["Header"] = header_types_df 643 644 ### QUAL 645 if "QUAL" in self.get_header_columns(): 646 sql_query_qual = f""" 647 SELECT 648 avg(CAST(QUAL AS INTEGER)) AS Average, 649 min(CAST(QUAL AS INTEGER)) AS Minimum, 650 max(CAST(QUAL AS INTEGER)) AS Maximum, 651 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 652 median(CAST(QUAL AS INTEGER)) AS Median, 653 variance(CAST(QUAL AS INTEGER)) AS Variance 654 FROM {table_variants_from} 655 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 656 """ 657 658 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 659 stats["Quality"] = {"Stats": qual} 660 661 ### SNV and InDel 662 663 sql_query_snv = f""" 664 665 SELECT Type, count FROM ( 666 667 SELECT 668 'Total' AS Type, 669 count(*) AS count 670 FROM {table_variants_from} 671 672 UNION 673 674 SELECT 675 'MNV' AS Type, 676 count(*) AS count 677 FROM {table_variants_from} 678 WHERE len(REF) > 1 AND len(ALT) > 1 679 AND len(REF) = len(ALT) 680 681 UNION 682 683 SELECT 684 'InDel' AS Type, 685 count(*) AS count 686 FROM {table_variants_from} 687 WHERE len(REF) > 1 OR len(ALT) > 1 688 AND len(REF) != len(ALT) 689 690 UNION 691 692 SELECT 693 'SNV' AS Type, 694 count(*) AS count 695 FROM {table_variants_from} 696 WHERE len(REF) = 1 AND len(ALT) = 1 697 698 ) 699 700 ORDER BY count DESC 701 702 """ 703 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 704 705 
sql_query_snv_substitution = f""" 706 SELECT 707 concat(REF, '>', ALT) AS 'Substitution', 708 count(*) AS count 709 FROM {table_variants_from} 710 WHERE len(REF) = 1 AND len(ALT) = 1 711 GROUP BY REF, ALT 712 ORDER BY count(*) DESC 713 """ 714 snv_substitution = ( 715 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 716 ) 717 stats["Variants"]["Counts"] = snv_indel 718 stats["Variants"]["Substitutions"] = snv_substitution 719 720 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
722 def stats_to_file(self, file: str = None) -> str: 723 """ 724 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 725 into a JSON object, and writes the JSON object to the specified file. 726 727 :param file: The `file` parameter is a string that represents the file path where the JSON data 728 will be written 729 :type file: str 730 :return: the name of the file that was written to. 731 """ 732 733 # Get stats 734 stats = self.get_stats() 735 736 # Serializing json 737 json_object = json.dumps(stats, indent=4) 738 739 # Writing to sample.json 740 with open(file, "w") as outfile: 741 outfile.write(json_object) 742 743 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it.

        The stats are first written to ``json_file`` (via
        ``stats_to_file``), then rendered as a markdown document (title,
        index and one section per stats entry, tables where possible)
        written to ``output_file`` and printed to stdout.

        :param output_file: path of the markdown output file; defaults to
            "stats.md" inside a temporary directory
        :type output_file: str
        :param json_file: path of the JSON stats file; defaults to
            "stats.json" inside a temporary directory
        :type json_file: str
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the entry as a table: first as a
                        # dict, then as a JSON string, else as plain text
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    # Empty section
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within it.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function `print_stats` does not return any value. It has a return type annotation of `None`.
847 def get_input(self) -> str: 848 """ 849 It returns the value of the input variable. 850 :return: The input is being returned. 851 """ 852 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
854 def get_input_format(self, input_file: str = None) -> str: 855 """ 856 It returns the format of the input variable. 857 :return: The format is being returned. 858 """ 859 if not input_file: 860 input_file = self.get_input() 861 input_format = get_file_format(input_file) 862 return input_format
It returns the format of the input variable.
Returns
The format is being returned.
864 def get_input_compressed(self, input_file: str = None) -> str: 865 """ 866 It returns the format of the input variable. 867 :return: The format is being returned. 868 """ 869 if not input_file: 870 input_file = self.get_input() 871 input_compressed = get_file_compressed(input_file) 872 return input_compressed
It returns the format of the input variable.
Returns
The format is being returned.
874 def get_output(self) -> str: 875 """ 876 It returns the output of the neuron. 877 :return: The output of the neural network. 878 """ 879 return self.output
It returns the output file path.
Returns
The output attribute.
881 def get_output_format(self, output_file: str = None) -> str: 882 """ 883 It returns the format of the input variable. 884 :return: The format is being returned. 885 """ 886 if not output_file: 887 output_file = self.get_output() 888 output_format = get_file_format(output_file) 889 890 return output_format
It returns the format of the input variable.
Returns
The format is being returned.
892 def get_config(self) -> dict: 893 """ 894 It returns the config 895 :return: The config variable is being returned. 896 """ 897 return self.config
It returns the config
Returns
The config variable is being returned.
899 def get_param(self) -> dict: 900 """ 901 It returns the param 902 :return: The param variable is being returned. 903 """ 904 return self.param
It returns the param
Returns
The param variable is being returned.
906 def get_connexion_db(self) -> str: 907 """ 908 It returns the connexion_db attribute of the object 909 :return: The connexion_db is being returned. 910 """ 911 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
913 def get_prefix(self) -> str: 914 """ 915 It returns the prefix of the object. 916 :return: The prefix is being returned. 917 """ 918 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
920 def get_table_variants(self, clause: str = "select") -> str: 921 """ 922 This function returns the table_variants attribute of the object 923 924 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 925 defaults to select (optional) 926 :return: The table_variants attribute of the object. 927 """ 928 929 # Access 930 access = self.get_config().get("access", None) 931 932 # Clauses "select", "where", "update" 933 if clause in ["select", "where", "update"]: 934 table_variants = self.table_variants 935 # Clause "from" 936 elif clause in ["from"]: 937 # For Read Only 938 if self.get_input_format() in ["parquet"] and access in ["RO"]: 939 input_file = self.get_input() 940 table_variants = f"'{input_file}' as variants" 941 # For Read Write 942 else: 943 table_variants = f"{self.table_variants} as variants" 944 else: 945 table_variants = self.table_variants 946 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
948 def get_tmp_dir(self) -> str: 949 """ 950 The function `get_tmp_dir` returns the temporary directory path based on configuration 951 parameters or a default path. 952 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 953 configuration, parameters, and a default value of "/tmp". 954 """ 955 956 return get_tmp( 957 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 958 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The `get_tmp_dir` method returns the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
960 def get_connexion_type(self) -> str: 961 """ 962 If the connexion type is not in the list of allowed connexion types, raise a ValueError 963 964 :return: The connexion type is being returned. 965 """ 966 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
968 def get_connexion(self): 969 """ 970 It returns the connection object 971 972 :return: The connection object. 973 """ 974 return self.conn
It returns the connection object
Returns
The connection object.
976 def close_connexion(self) -> None: 977 """ 978 This function closes the connection to the database. 979 :return: The connection is being closed. 980 """ 981 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
983 def get_header(self, type: str = "vcf"): 984 """ 985 This function returns the header of the VCF file as a list of strings 986 987 :param type: the type of header you want to get, defaults to vcf (optional) 988 :return: The header of the vcf file. 989 """ 990 991 if self.header_vcf: 992 if type == "vcf": 993 return self.header_vcf 994 elif type == "list": 995 return self.header_list 996 else: 997 if type == "vcf": 998 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 999 return header 1000 elif type == "list": 1001 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1003 def get_header_length(self, file: str = None) -> int: 1004 """ 1005 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1006 line. 1007 1008 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1009 header file. If this argument is provided, the function will read the header from the specified 1010 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1011 :type file: str 1012 :return: the length of the header list, excluding the #CHROM line. 1013 """ 1014 1015 if file: 1016 return len(self.read_vcf_header_file(file=file)) - 1 1017 elif self.get_header(type="list"): 1018 return len(self.get_header(type="list")) - 1 1019 else: 1020 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1022 def get_header_columns(self) -> str: 1023 """ 1024 This function returns the header list of a VCF 1025 1026 :return: The length of the header list. 1027 """ 1028 if self.get_header(): 1029 return self.get_header(type="list")[-1] 1030 else: 1031 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1033 def get_header_columns_as_list(self) -> list: 1034 """ 1035 This function returns the header list of a VCF 1036 1037 :return: The length of the header list. 1038 """ 1039 if self.get_header(): 1040 return self.get_header_columns().strip().split("\t") 1041 else: 1042 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1044 def get_header_columns_as_sql(self) -> str: 1045 """ 1046 This function retruns header length (without #CHROM line) 1047 1048 :return: The length of the header list. 1049 """ 1050 sql_column_list = [] 1051 for col in self.get_header_columns_as_list(): 1052 sql_column_list.append(f'"{col}"') 1053 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1055 def get_header_sample_list(self) -> list: 1056 """ 1057 This function retruns header length (without #CHROM line) 1058 1059 :return: The length of the header list. 1060 """ 1061 return self.header_vcf.samples
This function returns the sample list of the VCF header.
Returns
The list of sample names.
1063 def get_verbose(self) -> bool: 1064 """ 1065 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1066 exist 1067 1068 :return: The value of the key "verbose" in the config dictionary. 1069 """ 1070 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1072 def get_connexion_format(self) -> str: 1073 """ 1074 It returns the connexion format of the object. 1075 :return: The connexion_format is being returned. 1076 """ 1077 connexion_format = self.connexion_format 1078 if connexion_format not in ["duckdb", "sqlite"]: 1079 log.error(f"Unknown connexion format {connexion_format}") 1080 raise ValueError(f"Unknown connexion format {connexion_format}") 1081 else: 1082 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks, and inserts each chunk into a table

        :param file: the file to be loaded
        :param columns: a string of the column names separated by commas
        :param header_len: the number of lines to skip at the beginning of the file, defaults to 0
            (optional)
        :param sep: the separator used in the file, defaults to \t (optional)
        :param chunksize: The number of rows to read in at a time, defaults to 1000000 (optional)
        """

        # Config: an explicit "load.chunk" config value overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: if chunksize resolves to a falsy value (e.g. 0), nothing is inserted
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves the table name "chunk" to the local pandas
                    # DataFrame via its replacement-scan mechanism, so the loop
                    # variable must keep the name "chunk"
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: append the DataFrame directly through pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks, and inserts each chunk into a table
Parameters
- file: the file to be loaded
- columns: a string of the column names separated by commas
- header_len: the number of lines to skip at the beginning of the file, defaults to 0 (optional)
- sep: the separator used in the file, defaults to "\t" (tab) (optional)
- chunksize: The number of rows to read in at a time, defaults to 1000000 (optional)
1121 def load_data( 1122 self, 1123 input_file: str = None, 1124 drop_variants_table: bool = False, 1125 sample_size: int = 20480, 1126 ) -> None: 1127 """ 1128 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1129 table before loading the data and specify a sample size. 1130 1131 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1132 table 1133 :type input_file: str 1134 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1135 determines whether the variants table should be dropped before loading the data. If set to 1136 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1137 not be dropped, defaults to False 1138 :type drop_variants_table: bool (optional) 1139 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1140 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1141 20480 1142 :type sample_size: int (optional) 1143 """ 1144 1145 log.info("Loading...") 1146 1147 # change input file 1148 if input_file: 1149 self.set_input(input_file) 1150 self.set_header() 1151 1152 # drop variants table 1153 if drop_variants_table: 1154 self.drop_variants_table() 1155 1156 # get table variants 1157 table_variants = self.get_table_variants() 1158 1159 # Access 1160 access = self.get_config().get("access", None) 1161 log.debug(f"access: {access}") 1162 1163 # Input format and compress 1164 input_format = self.get_input_format() 1165 input_compressed = self.get_input_compressed() 1166 log.debug(f"input_format: {input_format}") 1167 log.debug(f"input_compressed: {input_compressed}") 1168 1169 # input_compressed_format 1170 if input_compressed: 1171 input_compressed_format = "gzip" 1172 else: 1173 input_compressed_format = "none" 1174 log.debug(f"input_compressed_format: {input_compressed_format}") 1175 1176 # Connexion 
format 1177 connexion_format = self.get_connexion_format() 1178 1179 # Sample size 1180 if not sample_size: 1181 sample_size = -1 1182 log.debug(f"sample_size: {sample_size}") 1183 1184 # Load data 1185 log.debug(f"Load Data from {input_format}") 1186 1187 # DuckDB connexion 1188 if connexion_format in ["duckdb"]: 1189 1190 # Database already exists 1191 if self.input_format in ["db", "duckdb"]: 1192 1193 if connexion_format in ["duckdb"]: 1194 log.debug(f"Input file format '{self.input_format}' duckDB") 1195 else: 1196 log.error( 1197 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1198 ) 1199 raise ValueError( 1200 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1201 ) 1202 1203 # Load from existing database format 1204 else: 1205 1206 try: 1207 # Create Table or View 1208 database = Database(database=self.input) 1209 sql_from = database.get_sql_from(sample_size=sample_size) 1210 1211 if access in ["RO"]: 1212 sql_load = ( 1213 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1214 ) 1215 else: 1216 sql_load = ( 1217 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1218 ) 1219 self.conn.execute(sql_load) 1220 1221 except: 1222 # Format not available 1223 log.error(f"Input file format '{self.input_format}' not available") 1224 raise ValueError( 1225 f"Input file format '{self.input_format}' not available" 1226 ) 1227 1228 # SQLite connexion 1229 elif connexion_format in ["sqlite"] and input_format in [ 1230 "vcf", 1231 "tsv", 1232 "csv", 1233 "psv", 1234 ]: 1235 1236 # Main structure 1237 structure = { 1238 "#CHROM": "VARCHAR", 1239 "POS": "INTEGER", 1240 "ID": "VARCHAR", 1241 "REF": "VARCHAR", 1242 "ALT": "VARCHAR", 1243 "QUAL": "VARCHAR", 1244 "FILTER": "VARCHAR", 1245 "INFO": "VARCHAR", 1246 } 1247 1248 # Strcuture with samples 1249 structure_complete = structure 1250 if self.get_header_sample_list(): 1251 structure["FORMAT"] = "VARCHAR" 
1252 for sample in self.get_header_sample_list(): 1253 structure_complete[sample] = "VARCHAR" 1254 1255 # Columns list for create and insert 1256 sql_create_table_columns = [] 1257 sql_create_table_columns_list = [] 1258 for column in structure_complete: 1259 column_type = structure_complete[column] 1260 sql_create_table_columns.append( 1261 f'"{column}" {column_type} default NULL' 1262 ) 1263 sql_create_table_columns_list.append(f'"{column}"') 1264 1265 # Create database 1266 log.debug(f"Create Table {table_variants}") 1267 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1268 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1269 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1270 self.conn.execute(sql_create_table) 1271 1272 # chunksize define length of file chunk load file 1273 chunksize = 100000 1274 1275 # delimiter 1276 delimiter = file_format_delimiters.get(input_format, "\t") 1277 1278 # Load the input file 1279 with open(self.input, "rt") as input_file: 1280 1281 # Use the appropriate file handler based on the input format 1282 if input_compressed: 1283 input_file = bgzf.open(self.input, "rt") 1284 if input_format in ["vcf"]: 1285 header_len = self.get_header_length() 1286 else: 1287 header_len = 0 1288 1289 # Insert the file contents into a table 1290 self.insert_file_to_table( 1291 input_file, 1292 columns=sql_create_table_columns_list_sql, 1293 header_len=header_len, 1294 sep=delimiter, 1295 chunksize=chunksize, 1296 ) 1297 1298 else: 1299 log.error( 1300 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1301 ) 1302 raise ValueError( 1303 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1304 ) 1305 1306 # Explode INFOS fields into table fields 1307 if self.get_explode_infos(): 1308 self.explode_infos( 1309 prefix=self.get_explode_infos_prefix(), 1310 fields=self.get_explode_infos_fields(), 
1311 force=True, 1312 ) 1313 1314 # Create index after insertion 1315 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped; if set to `False` (default), it will not be dropped.
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file, defaults to 20480.
1317 def get_explode_infos(self) -> bool: 1318 """ 1319 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1320 to False if it is not set. 1321 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1322 value. If the parameter is not present, it will return False. 1323 """ 1324 1325 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns the list of INFO fields to
        explode, resolved from the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The fields to be exploded, either as a list or
            as a comma-separated string. Each entry is treated as a regex pattern
            matched against the header INFO fields; the keyword "*" matches all of
            them. When not provided, the value is taken from the
            "explode.explode_infos_fields" parameter, and defaults to "*"
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: When set to `True`, any field that is
            not present in the header INFO fields is excluded from the result,
            defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: the resolved list of INFO field names, without duplicates
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                # (a pattern expansion must not re-add fields already listed explicitly)
                if fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can be a comma-separated list of field names (or patterns) to explode.
- remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields. Defaults to False.
Returns
The function `get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided, all header INFO fields are used. Otherwise, it returns the list of fields obtained after stripping spaces, splitting the string by commas, and resolving patterns against the header.
1425 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1426 """ 1427 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1428 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1429 not provided. 1430 1431 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1432 prefix to be used for exploding or expanding information 1433 :type explode_infos_prefix: str 1434 :return: the value of the variable `explode_infos_prefix`. 1435 """ 1436 1437 if not explode_infos_prefix: 1438 explode_infos_prefix = ( 1439 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1440 ) 1441 1442 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a prefix to be used for exploded INFO fields.
Returns
the value of the variable
explode_infos_prefix.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the table
        :param column_type: The SQL data type of the column to add, such as
            "INTEGER", "TEXT", "REAL", etc
        :param default_value: Optional DEFAULT value for the newly added column,
            applied to existing rows that have no value for that column
        :param drop: When `True` and the column already exists, the column is
            dropped and re-created; when `False` (default) an existing column is
            left untouched, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column ("table_name", "column_name",
            "column_type", "default_value"), or None when no new column was added
            (column already present without drop, or column re-created after a
            drop — see note below)
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE: after a drop+re-create, `added` stays False and the function
        # returns None; callers (e.g. explode_infos) rely on this by testing
        # `added_column or force`
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc.
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column; if set to `False` (default), the existing column is kept. Defaults to False.
Returns
a dict describing the added column, or None when no column was added to the table.
1516 def drop_column( 1517 self, column: dict = None, table_name: str = None, column_name: str = None 1518 ) -> bool: 1519 """ 1520 The `drop_column` function drops a specified column from a given table in a database and returns 1521 True if the column was successfully dropped, and False if the column does not exist in the 1522 table. 1523 1524 :param column: The `column` parameter is a dictionary that contains information about the column 1525 you want to drop. It has two keys: 1526 :type column: dict 1527 :param table_name: The `table_name` parameter is the name of the table from which you want to 1528 drop a column 1529 :type table_name: str 1530 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1531 from the table 1532 :type column_name: str 1533 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1534 and False if the column does not exist in the table. 1535 """ 1536 1537 # Find column infos 1538 if column: 1539 if isinstance(column, dict): 1540 table_name = column.get("table_name", None) 1541 column_name = column.get("column_name", None) 1542 elif isinstance(column, str): 1543 table_name = self.get_table_variants() 1544 column_name = column 1545 else: 1546 table_name = None 1547 column_name = None 1548 1549 if not table_name and not column_name: 1550 return False 1551 1552 # Removed 1553 removed = False 1554 1555 # Check if the column already exists in the table 1556 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1557 columns = self.get_query_to_df(query).columns.tolist() 1558 if column_name in columns: 1559 log.debug(f"The {column_name} column exists in the {table_name} table") 1560 else: 1561 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1562 return False 1563 1564 # Add column in table # ALTER TABLE integers DROP k 1565 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1566 
self.execute_query(add_column_query) 1567 removed = True 1568 log.debug( 1569 f"The {column_name} column was successfully dropped to the {table_name} table" 1570 ) 1571 1572 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is a dictionary that contains information about the column you want to drop. It has two keys: "table_name" and "column_name".
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
        columns, returning a list of added columns.

        :param prefix: Prefix for the exploded INFO columns; when not provided (or
            not a string), `self.get_explode_infos_prefix()` is used, falling back
            to "INFO/"
        :type prefix: str
        :param create_index: When `True`, indexes are (re)created after the fields
            have been exploded, defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields (or patterns) to explode into individual
            columns; when not provided, all INFO fields are exploded
        :type fields: list
        :param force: When `True`, an already-existing column is dropped and
            re-created, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: When `True`, all INFO fields are
            updated in a single UPDATE statement; when `False`, one UPDATE per
            field is executed, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: the list of added columns (dicts as returned by `add_column`)
        """

        # drop indexes (they would slow down / block the UPDATEs below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Nothing is exploded in read-only mode
        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: defaults to empty when unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the prefixed list
                # or the extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/number from the header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # NOTE: add_column returns None after a drop+re-create, hence
                    # the extra `force` test here
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract the "INFO=value" pair into
                        # the new column ('' and '.' are normalized to NULL)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (one UPDATE per chromosome keeps statements smaller);
                # fall back to a single unfiltered pass when the query fails
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function takes a VCF file and explodes the INFO fields into individual
columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix.
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to `False`, they will not. Defaults to False.
- fields: The `fields` parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded.
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate the column if it already exists in the table. Defaults to False.
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to `True`, all the INFO fields will be processed in a single UPDATE; if set to `False`, each INFO field is processed individually. Defaults to False.
Returns
The function `explode_infos` returns a list of added columns.
1780 def create_indexes(self) -> None: 1781 """ 1782 Create indexes on the table after insertion 1783 """ 1784 1785 # Access 1786 access = self.get_config().get("access", None) 1787 1788 # get table variants 1789 table_variants = self.get_table_variants("FROM") 1790 1791 if self.get_indexing() and access not in ["RO"]: 1792 # Create index 1793 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1794 self.conn.execute(sql_create_table_index) 1795 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1796 self.conn.execute(sql_create_table_index) 1797 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1798 self.conn.execute(sql_create_table_index) 1799 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1800 self.conn.execute(sql_create_table_index) 1801 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1802 self.conn.execute(sql_create_table_index) 1803 for field in self.index_additionnal_fields: 1804 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1805 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
def drop_indexes(self) -> None:
    """
    Drop all existing indexes on the variants table.

    No-op when the database is opened in read-only ("RO") access mode,
    or when the connexion format is not a supported one (duckdb, sqlite).
    """

    # Access mode (indexes cannot be dropped on a read-only database)
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    # Get database format
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
        else:
            # Fixed: unknown formats previously fell through and raised
            # UnboundLocalError on sql_list_indexes
            log.warning(f"Unknown connexion format '{connexion_format}': no index dropped")
            return

        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)
Drop all existing indexes on the table
def read_vcf_header(self, f) -> list:
    """
    Read the header of a VCF file and return the header lines.

    Lines are consumed from the file object up to and including the
    "#CHROM" column line, which terminates a VCF header.

    :param f: the file object (any iterable of lines)
    :return: The header lines of the VCF file, "#CHROM" line included.
    """

    header_lines = []
    for vcf_line in f:
        header_lines.append(vcf_line)
        if vcf_line.startswith("#CHROM"):
            return header_lines
    return header_lines
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
def read_vcf_header_file(self, file: str = None) -> list:
    """
    Read the header of a VCF file, either from a compressed or
    uncompressed file.

    Compression is auto-detected via `get_input_compressed`: a BGZF
    reader is used for compressed input, a plain text reader otherwise.
    (There is no `compressed` parameter; detection is automatic.)

    :param file: path to the VCF header file to read. Optional; defaults
    to `None`
    :type file: str
    :return: a list of header lines, as returned by `read_vcf_header`.
    """

    if self.get_input_compressed(input_file=file):
        with bgzf.open(file, "rt") as f:
            return self.read_vcf_header(f=f)
    else:
        with open(file, "rt") as f:
            return self.read_vcf_header(f=f)
The function read_vcf_header_file reads the header of a VCF file, either from a compressed or
uncompressed file.
Parameters
- file: The
fileparameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default toNone - compressed: The
compressedparameter is a boolean flag that indicates whether the VCF file is compressed or not. Ifcompressedis set toTrue, it means that the VCF file is compressed using the BGZF compression format. Ifcompressedis set toFalse, it means that, defaults to False
Returns
a list.
def execute_query(self, query: str):
    """
    Execute an SQL query on the current connexion and return the result.

    :param query: The query to be executed
    :return: the result/cursor of the query, or None when no query is given
    """

    # Guard clause: nothing to execute
    if not query:
        return None
    return self.conn.execute(query)
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
) -> bool:
    """
    Export data from the variants table to an output file in various
    formats (VCF, CSV, TSV, PSV, Parquet).

    :param output_file: name of the file to generate; defaults to the
        object's configured output
    :param output_header: file where the VCF header is exported; defaults
        to `<output_file>.hdr`
    :param export_header: whether to export the header to a separate file;
        switched off for VCF output (header is embedded), defaults to True
    :param query: optional SQL query selecting the data to export; defaults
        to the whole variants table
    :param parquet_partitions: columns used to partition a Parquet export
        (list, or comma-separated string)
    :param chunk_size: number of records per batch for Parquet export
    :param threads: number of threads for the export; defaults to the
        object's configured threads
    :param sort: whether to sort the output file, defaults to False
    :param index: whether to create an index on the output file, defaults
        to False
    :param order_by: column(s) used to sort the output (VCF format only)
    :return: True if the output file exists after export, None otherwise
    """

    # Log
    log.info("Exporting...")

    # Resolve full paths
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Tmp files to remove
    tmp_to_remove = []

    # If no output, get it
    if not output_file:
        output_file = self.get_output()

    # If not threads
    if not threads:
        threads = self.get_threads()

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header
        self.export_header(output_file=output_file)

    # Switch off export header if VCF output (header is embedded in the VCF)
    output_file_type = get_file_format(output_file)
    if output_file_type in ["vcf"]:
        export_header = False
        tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partitions (accept a comma-separated string from caller or param)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Header in output
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source: the connexion itself, or a Parquet dump for sqlite
    database_source = self.get_connexion()

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Explode infos
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # SQLite cannot be exported directly: dump the variants table to a
    # temporary Parquet file and export from there
    if connexion_format in ["sqlite"]:

        # Export in Parquet, with a random temporary name
        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for i in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Create export query
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
        """

        # Write source file
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Create database object for the export
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing columns header (read from the database itself)
    existing_columns_header = database.get_header_columns_from_database()

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # Fixed: original returned `X and X` — the duplicated existence check
    # was redundant
    return os.path.exists(output_file) or None
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: The
output_fileparameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved - output_header: The
output_headerparameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as theoutput_fileparameter, but with the extension " - export_header: The
export_headerparameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. Ifexport_headeris True, the header will be exported to a file. Ifexport_headeris False, the header will not be, defaults to True, if output format is not VCF - query: The
queryparameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported - parquet_partitions: The
parquet_partitionsparameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets - chunk_size: The
chunk_sizeparameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. - threads: The
threadsparameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads - sort: The
sortparameter is a boolean flag that determines whether the output file should be sorted or not. Ifsortis set toTrue, the output file will be sorted based on the genomic coordinates of the variants. By default, the value ofsortisFalse, defaults to False - index: The
indexparameter is a boolean flag that determines whether an index should be created on the output file. Ifindexis True, an index will be created. Ifindexis False, no index will be created. The default value is False, defaults to False - order_by: The
order_byparameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
def get_extra_infos(self, table: str = None) -> list:
    """
    Return the list of columns present in the table but not in the header.

    NOTE(review): when `table` is provided explicitly, header columns are
    not loaded (header_columns stays empty), so every column of that table
    is returned — confirm this is the intended contract.

    :param table: The table to get the extra columns from. If not specified, it will use the
    variants table and compare against the header columns
    :return: A list of columns that are in the table but not in the header
    """

    header_columns = []

    if not table:
        table = self.get_table_variants(clause="from")
        header_columns = self.get_header_columns()

    # Check all columns in the database (LIMIT 1 just to obtain the schema)
    query = f""" SELECT * FROM {table} LIMIT 1 """
    log.debug(f"query {query}")
    table_columns = self.get_query_to_df(query).columns.tolist()
    extra_columns = []

    # Construct extra infos (not in header)
    for column in table_columns:
        if column not in header_columns:
            extra_columns.append(column)

    return extra_columns
This function returns a list of columns that are in the table but not in the header
The function is called get_extra_infos and it takes two arguments: self and table. The
self argument is a reference to the object that called the function. The table argument is
the name of the table that we want to get the extra columns from
Parameters
- table: The table to get the extra columns from. If not specified, it will use the variants table
- format: The format of the output. If it's "sql", it will return a string of the extra columns separated by commas. If it's "list", it will return a list of the extra columns
Returns
A list of columns that are in the table but not in the header
def get_extra_infos_sql(self, table: str = None) -> str:
    """
    Return the extra info columns as a comma-separated SQL column list,
    each column name wrapped in double quotes.

    :param table: The name of the table to get the extra infos from. If None, the default table is
    used
    :type table: str
    :return: A string of the quoted extra infos, separated by commas
    """

    quoted_columns = [f'"{column}"' for column in self.get_extra_infos(table=table)]
    return ", ".join(quoted_columns)
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, optionally clean it, and write it to a
    `<output_file><output_file_ext>` file.

    NOTE(review): `header_name` is only tested in the initial condition and
    never used to name the written file — confirm against callers.
    Returns None implicitly when the object has no header.

    :param header_name: if neither `header_name` nor `output_file` is given,
        the object's configured output is used as `output_file`
    :type header_name: str
    :param output_file: base name of the file the header is written next to
    :type output_file: str
    :param output_file_ext: extension appended to `output_file` to build the
        header file name, defaults to .hdr
    :type output_file_ext: str (optional)
    :param clean_header: whether to fix malformed header lines (FORMAT
        fields declared as Type=Flag are rewritten as Type=String),
        defaults to True
    :type clean_header: bool (optional)
    :param remove_chrom_line: whether to drop the #CHROM line from the
        written header, defaults to False
    :type remove_chrom_line: bool (optional)
    :return: the name of the header file that was created
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Create database object on the input file to fetch its real columns
        db_for_header = Database(database=self.get_input())

        # Get real columns in the file
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write header to a temporary file
            # (vcf.Writer emits the header upon construction)
            header_file_tmp = os.path.join(tmpdir, "header")
            # Fixed: use context managers instead of bare open()/close()
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace #CHROM line with the real columns of the file
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove CHROM line
            if remove_chrom_line:
                header_list.pop()

            # Clean header
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    # Clean head for malformed header:
                    # Type=Flag is invalid for FORMAT fields, use String
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    # Write header
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

            tmp_header_name = output_file + output_file_ext

            with open(tmp_header_name, "w") as f:
                for line in header_list:
                    f.write(line)

        return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
remove_chrom_lineparameter in theexport_headerfunction is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set toTrue, the #CHROM line will be removed; if set to `, defaults to False
Returns
The function
export_headerreturns the name of the temporary header file that is created.
def export_variant_vcf(
    self,
    vcf_file,
    remove_info: bool = False,
    add_samples: bool = True,
    list_samples: list = [],
    index: bool = False,
    threads: int | None = None,
) -> bool | None:
    """
    Export the variants table as a VCF file, with options to blank the
    INFO field, include/exclude samples, and index the output.

    :param vcf_file: name of the output VCF file
    :param remove_info: if truthy, the INFO column is replaced by a
        constant placeholder. May also be a string, in which case that
        string is used verbatim as the placeholder; any other truthy value
        yields ".". Defaults to False (INFO kept)
    :type remove_info: bool | str (optional)
    :param add_samples: whether FORMAT and sample columns are included in
        the output, defaults to True
    :type add_samples: bool (optional)
    :param list_samples: samples to include; when empty, all samples from
        the header are used
    :type list_samples: list
    :param index: whether to create an index on the output VCF file,
        defaults to False
    :type index: bool (optional)
    :param threads: number of threads for the export; defaults to the
        object's configured threads
    :type threads: int | None
    :return: the result of `export_output` (True if the output file exists,
        None otherwise)
    """

    # Config
    config = self.get_config()

    # Extract VCF
    log.debug("Export VCF...")

    # Table variants
    table_variants = self.get_table_variants()

    # Threads
    if not threads:
        threads = self.get_threads()

    # Info fields: keep INFO as-is, or select a constant placeholder
    if remove_info:
        if not isinstance(remove_info, str):
            remove_info = "."
        info_field = f"""'{remove_info}' as INFO"""
    else:
        info_field = "INFO"

    # Samples fields (FORMAT column plus one column per sample)
    if add_samples:
        if not list_samples:
            list_samples = self.get_header_sample_list()
        if list_samples:
            samples_fields = " , FORMAT , " + " , ".join(list_samples)
        else:
            samples_fields = ""
        log.debug(f"samples_fields: {samples_fields}")
    else:
        samples_fields = ""

    # Variants selection query (fixed VCF column order)
    select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
    sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} """

    return self.export_output(
        output_file=vcf_file,
        output_header=None,
        export_header=True,
        query=sql_query_select,
        parquet_partitions=None,
        chunk_size=config.get("chunk_size", None),
        threads=threads,
        sort=True,
        index=index,
        order_by=None,
    )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
remove_infoparameter in theexport_variant_vcffunction is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set toTrue, the INFO field will be removed. If set toFalse, the INFO field will be included in, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
indexparameter in theexport_variant_vcffunction is a boolean flag that determines whether or not to create an index for the output VCF file. Ifindexis set toTrue, the output VCF file will be indexed using tabix. Ifindex, defaults to False - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
export_variant_vcffunction returns the result of calling theexport_outputmethod with various parameters including the output file, query, threads, sort flag, and index flag. Theexport_outputmethod is responsible for exporting the VCF data based on the specified parameters and configurations provided in theexport_variant_vcffunction.
def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    Run a list of shell commands in parallel using the given number of threads.

    :param commands: A list of commands to run (defaults to no command)
    :param threads: The number of threads to use, defaults to 1 (optional)
    """

    # Fixed: avoid the shared mutable-default-argument pitfall ([] default)
    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
    """
    Return the number of threads to use for the current job.

    The value is looked up in the parameters first, then in the
    configuration. A missing or falsy value yields `default`; a value
    less than or equal to 0 means "use every available core".

    :param default: number of threads used when none is configured,
    defaults to 1
    :type default: int (optional)
    :return: the number of threads to use for the current job.
    """

    # Parameters take precedence over configuration
    requested = self.get_param().get("threads", self.get_config().get("threads", None))

    if not requested:
        return default
    if int(requested) <= 0:
        # Non-positive value: use all available cores
        return os.cpu_count()
    return int(requested)
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
defaultparameter in theget_threadsmethod is used to specify the default number of threads to use if no specific value is provided. If no value is provided for thethreadsparameter in the configuration or input parameters, thedefaultvalue will be used, defaults to 1
Returns
the number of threads to use for the current job.
def get_memory(self, default: str = None) -> str:
    """
    Return the memory setting from parameters or configuration, falling
    back to `default` when not set.

    :param default: fallback value returned when no memory setting is found
    :type default: str
    :return: the configured memory value, or `default`
    """

    # Parameters take precedence over configuration
    configured = self.get_param().get("memory", self.get_config().get("memory", None))

    return configured if configured else default
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
get_memoryfunction takes in a default value as a string parameter. This default value is used as a fallback in case thememoryparameter is not provided in theparamdictionary or theconfigdictionary. Ifmemoryis not found in either dictionary, the function
Returns
The
get_memoryfunction returns a string value representing the memory parameter. If theinput_memoryis provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
def update_from_vcf(self, vcf_file: str) -> None:
    """
    Update the variants table from a VCF file, dispatching on the
    connexion format (duckdb or sqlite). Unknown formats are a no-op.

    :param vcf_file: the path to the VCF file
    """

    handlers = {
        "duckdb": self.update_from_vcf_duckdb,
        "sqlite": self.update_from_vcf_sqlite,
    }
    handler = handlers.get(self.get_connexion_format())
    if handler is not None:
        handler(vcf_file)
If the connexion format is duckdb, use the duckdb method; if it is sqlite, use the sqlite method.
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using the duckdb connexion.

        The VCF is loaded into a pandas DataFrame named `vcf_df`; the SQL
        below references `vcf_df` directly — presumably resolved by duckdb's
        DataFrame replacement scan, so the local variable name matters
        (NOTE(review): confirm replacement scan is the intended mechanism).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a temporary DataFrame, skipping the '##' header
        # lines so that the '#CHROM' line becomes the column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )

        # Concatenate the INFO of matching VCF records into the variants INFO
        # (';'-separated when both sides carry a value; '' and '.' are empty)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2453 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2454 """ 2455 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2456 table, then updates the INFO column of the variants table with the INFO column of the temporary 2457 table 2458 2459 :param vcf_file: The path to the VCF file you want to update the database with 2460 """ 2461 2462 # Create a temporary table for the VCF 2463 table_vcf = "tmp_vcf" 2464 sql_create = ( 2465 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2466 ) 2467 self.conn.execute(sql_create) 2468 2469 # Loading VCF into temporaire table 2470 vcf_df = pd.read_csv( 2471 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2472 ) 2473 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2474 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2475 2476 # Update table 'variants' with VCF data 2477 # warning: CONCAT as || operator 2478 sql_query_update = f""" 2479 UPDATE variants as table_variants 2480 SET INFO = CASE 2481 WHEN INFO NOT IN ('', '.') 2482 THEN INFO 2483 ELSE '' 2484 END || 2485 ( 2486 SELECT 2487 CASE 2488 WHEN table_variants.INFO NOT IN ('','.') 2489 AND table_vcf.INFO NOT IN ('','.') 2490 THEN ';' 2491 ELSE '' 2492 END || 2493 CASE 2494 WHEN table_vcf.INFO NOT IN ('','.') 2495 THEN table_vcf.INFO 2496 ELSE '' 2497 END 2498 FROM {table_vcf} as table_vcf 2499 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2500 AND table_vcf.\"POS\" = table_variants.\"POS\" 2501 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2502 AND table_vcf.\"REF\" = table_variants.\"REF\" 2503 ) 2504 """ 2505 self.conn.execute(sql_query_update) 2506 2507 # Drop temporary table 2508 sql_drop = f"DROP TABLE {table_vcf}" 2509 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2511 def drop_variants_table(self) -> None: 2512 """ 2513 > This function drops the variants table 2514 """ 2515 2516 table_variants = self.get_table_variants() 2517 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2518 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash built from the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the
            variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it
            already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (fall back to the default name)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id"
        # instead of `variant_id_column`, so a custom column name is never
        # detected as already existing — confirm whether this is intentional.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is wrapped in single quotes, so
            # it is hashed as a literal string, not as the exploded SVTYPE
            # column value — verify whether the column content was intended.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporarily exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2579 def get_variant_id_column( 2580 self, variant_id_column: str = "variant_id", force: bool = None 2581 ) -> str: 2582 """ 2583 This function returns the variant_id column name 2584 2585 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2586 defaults to variant_id 2587 :type variant_id_column: str (optional) 2588 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2589 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2590 if it is not already set, or if it is set 2591 :type force: bool 2592 :return: The variant_id column name. 2593 """ 2594 2595 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, force the variant_id column to be (re)created even if it already exists; if False or None, the column is only created when it is missing.
Returns
The variant_id column name.
2601 def scan_databases( 2602 self, database_formats: list["parquet"], database_releases: list = ["current"] 2603 ) -> dict: 2604 """ 2605 The function `scan_databases` scans for available databases based on specified formats and 2606 releases. 2607 2608 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2609 of the databases to be scanned. In this case, the accepted format is "parquet" 2610 :type database_formats: list ["parquet"] 2611 :param database_releases: The `database_releases` parameter is a list that specifies the 2612 releases of the databases to be scanned. In the provided function, the default value for 2613 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2614 databases that are in the "current" 2615 :type database_releases: list 2616 :return: The function `scan_databases` returns a dictionary containing information about 2617 databases that match the specified formats and releases. 2618 """ 2619 2620 # Config 2621 config = self.get_config() 2622 2623 # Param 2624 param = self.get_param() 2625 2626 # Param - Assembly 2627 assembly = param.get("assembly", config.get("assembly", None)) 2628 if not assembly: 2629 assembly = DEFAULT_ASSEMBLY 2630 log.warning(f"Default assembly '{assembly}'") 2631 2632 # Scan for availabled databases 2633 log.info( 2634 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2635 ) 2636 databases_infos_dict = databases_infos( 2637 database_folder_releases=database_releases, 2638 database_formats=database_formats, 2639 assembly=assembly, 2640 config=config, 2641 ) 2642 log.info( 2643 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2644 ) 2645 2646 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The `database_formats` parameter is a list that specifies the formats of the databases to be scanned (e.g. "parquet").
- database_releases: The `database_releases` parameter is a list that specifies the releases of the databases to be scanned; its default value is `["current"]`, meaning that by default only databases in the "current" release are scanned.
Returns
The `scan_databases` function returns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config
        file.

        The quick "annotations" string and the per-tool parameters
        (annotation_parquet, annotation_snpsift, annotation_snpeff,
        annotation_bcftools, annotation_annovar, annotation_exomiser,
        annotation_splice) are merged into one annotation list, each database
        file is resolved on disk and dispatched to an annotation tool, the
        `param` dict is updated in place (key "annotation") and the selected
        tools are then run.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Annotations databases folders (annotations + parquet + bcftools)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated quick-annotation string)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tool's own param is folded into the quick-annotation list,
        # prefixed by the tool name (e.g. "snpsift:", "bcftools:")
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: map each entry to its fields
            # ({"INFO": None} means "all INFO fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL (scan every available database)
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases),
                    # e.g. "ALL:format=parquet+vcf:release=current"
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection ("bcftools:"/"snpsift:" prefix wins)
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly set)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools is never preferred here; the
                                    # flag is kept for the condition below
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        # Store the (possibly) updated param back on the object
        self.set_param(param)

        # Run the selected annotation tools
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
    def annotation_snpsift(self, threads: int = None) -> None:
        """
        Annotate the variants with SnpSift databases.

        For each configured database, a `SnpSift annotate | bcftools annotate`
        shell command is built; the current variants are exported to a
        temporary VCF, the commands are run, and the resulting annotated VCFs
        are merged back into the variants table.

        NOTE(review): several log messages below still say "bcftools"
        (probably copied from annotation_bcftools) — the actual annotator is
        SnpSift.

        :param threads: Number of threads to use
        :return: None; the variants table is updated in place
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - snpSift (jar launched through the configured java command)
        snpsift_bin_command = get_bin_command(
            bin="SnpSift.jar",
            tool="snpsift",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpsift_bin_command:
            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - bcftools (used downstream of SnpSift in the pipe)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - databases folders (annotations + bcftools folders)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, and their fields
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("snpsift", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate otherwise)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file (written later, once commands are known)
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # Init: shell command -> annotated output VCF path
                commands = {}

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files (database, header, index)
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()
                    db_tbi_file = f"{db_file}.tbi"
                    db_file_compressed = database.is_compressed()

                    # Check if compressed (bgzip required by SnpSift/tabix)
                    if not db_file_compressed:
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )

                    # Check if indexed
                    if not os.path.exists(db_tbi_file):
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )

                    # Check database and header files exist
                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        log.error(f"Annotation annotation index: {db_tbi_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                        )
                    else:

                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database ("ALL"/"INFO" wildcard)
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Number of fields retained for this database
                        nb_annotation_field = 0
                        annotation_list = []
                        annotation_infos_rename_list = []

                        for annotation_field in annotation_fields:

                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                            annotation_fields_new_name = annotation_fields.get(
                                annotation_field, annotation_field
                            )
                            if not annotation_fields_new_name:
                                annotation_fields_new_name = annotation_field

                            # Check if field is in DB and not already in input data
                            if (
                                annotation_field in db_hdr_vcf.get_header().infos
                                and annotation_fields_new_name
                                not in self.get_header().infos
                            ):

                                log.info(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                                )

                                # BCFTools annotate param to rename fields
                                if annotation_field != annotation_fields_new_name:
                                    annotation_infos_rename_list.append(
                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                    )

                                # Add INFO field to header (defaults for
                                # missing number/type/desc/source/version)
                                db_hdr_vcf_header_infos_number = (
                                    db_hdr_vcf_header_infos[annotation_field].num or "."
                                )
                                db_hdr_vcf_header_infos_type = (
                                    db_hdr_vcf_header_infos[annotation_field].type
                                    or "String"
                                )
                                db_hdr_vcf_header_infos_description = (
                                    db_hdr_vcf_header_infos[annotation_field].desc
                                    or f"{annotation_field} description"
                                )
                                db_hdr_vcf_header_infos_source = (
                                    db_hdr_vcf_header_infos[annotation_field].source
                                    or "unknown"
                                )
                                db_hdr_vcf_header_infos_version = (
                                    db_hdr_vcf_header_infos[annotation_field].version
                                    or "unknown"
                                )

                                vcf_reader.infos[annotation_fields_new_name] = (
                                    vcf.parser._Info(
                                        annotation_fields_new_name,
                                        db_hdr_vcf_header_infos_number,
                                        db_hdr_vcf_header_infos_type,
                                        db_hdr_vcf_header_infos_description,
                                        db_hdr_vcf_header_infos_source,
                                        db_hdr_vcf_header_infos_version,
                                        self.code_type_map[
                                            db_hdr_vcf_header_infos_type
                                        ],
                                    )
                                )

                                annotation_list.append(annotation_field)

                                nb_annotation_field += 1

                            else:

                                if (
                                    annotation_field
                                    not in db_hdr_vcf.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
                                    )
                                if (
                                    annotation_fields_new_name
                                    in self.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                    )

                        log.info(
                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                        )

                        annotation_infos = ",".join(annotation_list)

                        if annotation_infos != "":

                            # Annotated VCF (and error file)
                            tmp_annotation_vcf_name = os.path.join(
                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
                            )
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )

                            # Add fields to annotate (omit -info when full)
                            if not annotation_fields_full:
                                annotation_infos_option = f"-info {annotation_infos}"
                            else:
                                annotation_infos_option = ""

                            # Info fields rename
                            if annotation_infos_rename_list:
                                annotation_infos_rename = " -c " + ",".join(
                                    annotation_infos_rename_list
                                )
                            else:
                                annotation_infos_rename = ""

                            # Annotate command
                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands[command_annotate] = tmp_annotation_vcf_name

                if commands:

                    # Export current variants as the input VCF for SnpSift
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )
                    # NOTE(review): copy to /tmp looks like leftover debug
                    # code — confirm whether it can be removed
                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")

                    # Num command
                    nb_command = 0

                    # Annotate
                    for command_annotate in commands:
                        nb_command += 1
                        log.info(
                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                        )
                        log.debug(f"command_annotate={command_annotate}")
                        run_parallel_commands([command_annotate], threads)

                        # NOTE(review): debug copy to /tmp — confirm removal
                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                        # Update variants
                        log.info(
                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                        )
                        self.update_from_vcf(commands[command_annotate])
This function annotates variants with bcftools.
Parameters:
- threads: number of threads to use
Returns:
None.
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using `bcftools annotate` against one or
        more VCF/BED annotation databases.

        Databases are taken from param section "annotation" -> "bcftools" ->
        "annotations" (a dict mapping database path to a dict of fields, where
        "INFO"/"ALL" means "all fields of the database header"). For each
        database and each chromosome present in the variants table, a region
        BED is built and a `bcftools annotate` command is generated; all
        commands are run in parallel, the per-chromosome outputs are merged
        back with `bcftools merge`, and the merged VCF is loaded back into the
        variants table.

        :param threads: Number of threads to use (defaults to self.get_threads())
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if the bcftools binary is not found, if a database
            file is not bgzip-compressed or not tabix-indexed or not valid, or
            if any generated command wrote "[E::" error lines to stderr
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the object-level setting when not provided
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files/folders around when running at debug verbosity,
        # so failed commands can be inspected
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config or default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - databases folders: union of generic "annotations" folders
        # and bcftools-specific folders (set() removes duplicates)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with (dict: database -> fields mapping)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Early exit when the variants table is empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Temp VCF that will hold the current variants, exported later only if
        # there is at least one annotation command to run
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header object; new INFO fields are registered on it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs to merge
            commands = []  # bcftools annotate commands to run in parallel
            tmp_files = []  # temp files to remove after the merge
            err_files = []  # stderr capture files to scan for [W::/[E:: lines

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields of the database"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Resolve the database (file, header, format) from the folders
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # bcftools requires a bgzip-compressed database
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # bcftools requires a tabix index next to the database
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Database and header files must both exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header as a Variants object to access
                    # its INFO field definitions
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" expands to every field of the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Optional field renaming: target name defaults to the
                        # source field name when no mapping is given
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Annotate only if the field exists in the database
                        # header and is not already present in the input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Register the INFO field on the output header,
                            # filling missing metadata with defaults
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c rename syntax NEW:=INFO/OLD when the
                            # field is renamed, plain field name otherwise
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" lines
                        # (drop "#CHROM" and any variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped headers, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases have no INFO header: prepend positional
                        # columns so bcftools -c maps CHROM/POS correctly
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One annotate command per chromosome, restricted to a
                        # BED of regions around the variants (limits I/O)
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each
                            # variant, clamped at 0, then merged into intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-chromosome annotated VCF + stderr)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate restricted to the region BED,
                            # then tabix-index the output; stderr appended to
                            # the .err file for later scanning
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export the current variants as the VCF all commands annotate
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Divide the thread budget across the parallel commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (chained after a successful merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Scan captured stderr for htslib-style warning ([W::) and
                    # error ([E::) messages
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # Any [E:: line means a command failed: abort the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Load the merged annotated VCF back into the variants table
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates variants with bcftools.
Parameters:
- threads: number of threads to use
Returns:
None.
3861 def annotation_exomiser(self, threads: int = None) -> None: 3862 """ 3863 This function annotate with Exomiser 3864 3865 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 3866 - "analysis" (dict/file): 3867 Full analysis dictionnary parameters (see Exomiser docs). 3868 Either a dict, or a file in JSON or YAML format. 3869 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 3870 Default : None 3871 - "preset" (string): 3872 Analysis preset (available in config folder). 3873 Used if no full "analysis" is provided. 3874 Default: "exome" 3875 - "phenopacket" (dict/file): 3876 Samples and phenotipic features parameters (see Exomiser docs). 3877 Either a dict, or a file in JSON or YAML format. 3878 Default: None 3879 - "subject" (dict): 3880 Sample parameters (see Exomiser docs). 3881 Example: 3882 "subject": 3883 { 3884 "id": "ISDBM322017", 3885 "sex": "FEMALE" 3886 } 3887 Default: None 3888 - "sample" (string): 3889 Sample name to construct "subject" section: 3890 "subject": 3891 { 3892 "id": "<sample>", 3893 "sex": "UNKNOWN_SEX" 3894 } 3895 Default: None 3896 - "phenotypicFeatures" (dict) 3897 Phenotypic features to construct "subject" section. 3898 Example: 3899 "phenotypicFeatures": 3900 [ 3901 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3902 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3903 ] 3904 - "hpo" (list) 3905 List of HPO ids as phenotypic features. 3906 Example: 3907 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3908 Default: [] 3909 - "outputOptions" (dict): 3910 Output options (see Exomiser docs). 
3911 Default: 3912 "output_options" = 3913 { 3914 "outputContributingVariantsOnly": False, 3915 "numGenes": 0, 3916 "outputFormats": ["TSV_VARIANT", "VCF"] 3917 } 3918 - "transcript_source" (string): 3919 Transcript source (either "refseq", "ucsc", "ensembl") 3920 Default: "refseq" 3921 - "exomiser_to_info" (boolean): 3922 Add exomiser TSV file columns as INFO fields in VCF. 3923 Default: False 3924 - "release" (string): 3925 Exomise database release. 3926 If not exists, database release will be downloaded (take a while). 3927 Default: None (provided by application.properties configuration file) 3928 - "exomiser_application_properties" (file): 3929 Exomiser configuration file (see Exomiser docs). 3930 Useful to automatically download databases (especially for specific genome databases). 3931 3932 Notes: 3933 - If no sample in parameters, first sample in VCF will be chosen 3934 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 3935 3936 :param threads: The number of threads to use 3937 :return: None. 
3938 """ 3939 3940 # DEBUG 3941 log.debug("Start annotation with Exomiser databases") 3942 3943 # Threads 3944 if not threads: 3945 threads = self.get_threads() 3946 log.debug("Threads: " + str(threads)) 3947 3948 # Config 3949 config = self.get_config() 3950 log.debug("Config: " + str(config)) 3951 3952 # Config - Folders - Databases 3953 databases_folders = ( 3954 config.get("folders", {}) 3955 .get("databases", {}) 3956 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 3957 ) 3958 databases_folders = full_path(databases_folders) 3959 if not os.path.exists(databases_folders): 3960 log.error(f"Databases annotations: {databases_folders} NOT found") 3961 log.debug("Databases annotations: " + str(databases_folders)) 3962 3963 # Config - Exomiser 3964 exomiser_bin_command = get_bin_command( 3965 bin="exomiser-cli*.jar", 3966 tool="exomiser", 3967 bin_type="jar", 3968 config=config, 3969 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 3970 ) 3971 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 3972 if not exomiser_bin_command: 3973 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 3974 log.error(msg_err) 3975 raise ValueError(msg_err) 3976 3977 # Param 3978 param = self.get_param() 3979 log.debug("Param: " + str(param)) 3980 3981 # Param - Exomiser 3982 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 3983 log.debug(f"Param Exomiser: {param_exomiser}") 3984 3985 # Param - Assembly 3986 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 3987 log.debug("Assembly: " + str(assembly)) 3988 3989 # Data 3990 table_variants = self.get_table_variants() 3991 3992 # Check if not empty 3993 log.debug("Check if not empty") 3994 sql_query_chromosomes = ( 3995 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3996 ) 3997 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 3998 log.info(f"VCF empty") 3999 return False 4000 4001 # VCF header 4002 
vcf_reader = self.get_header() 4003 log.debug("Initial header: " + str(vcf_reader.infos)) 4004 4005 # Samples 4006 samples = self.get_header_sample_list() 4007 if not samples: 4008 log.error("No Samples in VCF") 4009 return False 4010 log.debug(f"Samples: {samples}") 4011 4012 # Memory limit 4013 memory_limit = self.get_memory("8G") 4014 log.debug(f"memory_limit: {memory_limit}") 4015 4016 # Exomiser java options 4017 exomiser_java_options = ( 4018 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4019 ) 4020 log.debug(f"Exomiser java options: {exomiser_java_options}") 4021 4022 # Download Exomiser (if not exists) 4023 exomiser_release = param_exomiser.get("release", None) 4024 exomiser_application_properties = param_exomiser.get( 4025 "exomiser_application_properties", None 4026 ) 4027 databases_download_exomiser( 4028 assemblies=[assembly], 4029 exomiser_folder=databases_folders, 4030 exomiser_release=exomiser_release, 4031 exomiser_phenotype_release=exomiser_release, 4032 exomiser_application_properties=exomiser_application_properties, 4033 ) 4034 4035 # Force annotation 4036 force_update_annotation = True 4037 4038 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4039 log.debug("Start annotation Exomiser") 4040 4041 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4042 4043 # tmp_dir = "/tmp/exomiser" 4044 4045 ### ANALYSIS ### 4046 ################ 4047 4048 # Create analysis.json through analysis dict 4049 # either analysis in param or by default 4050 # depending on preset exome/genome) 4051 4052 # Init analysis dict 4053 param_exomiser_analysis_dict = {} 4054 4055 # analysis from param 4056 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4057 param_exomiser_analysis = full_path(param_exomiser_analysis) 4058 4059 # If analysis in param -> load anlaysis json 4060 if param_exomiser_analysis: 4061 4062 # If param analysis is a file and exists 4063 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4064 param_exomiser_analysis 4065 ): 4066 # Load analysis file into analysis dict (either yaml or json) 4067 with open(param_exomiser_analysis) as json_file: 4068 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4069 4070 # If param analysis is a dict 4071 elif isinstance(param_exomiser_analysis, dict): 4072 # Load analysis dict into analysis dict (either yaml or json) 4073 param_exomiser_analysis_dict = param_exomiser_analysis 4074 4075 # Error analysis type 4076 else: 4077 log.error(f"Analysis type unknown. Check param file.") 4078 raise ValueError(f"Analysis type unknown. Check param file.") 4079 4080 # Case no input analysis config file/dict 4081 # Use preset (exome/genome) to open default config file 4082 if not param_exomiser_analysis_dict: 4083 4084 # default preset 4085 default_preset = "exome" 4086 4087 # Get param preset or default preset 4088 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4089 4090 # Try to find if preset is a file 4091 if os.path.exists(param_exomiser_preset): 4092 # Preset file is provided in full path 4093 param_exomiser_analysis_default_config_file = ( 4094 param_exomiser_preset 4095 ) 4096 # elif os.path.exists(full_path(param_exomiser_preset)): 4097 # # Preset file is provided in full path 4098 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4099 elif os.path.exists( 4100 os.path.join(folder_config, param_exomiser_preset) 4101 ): 4102 # Preset file is provided a basename in config folder (can be a path with subfolders) 4103 param_exomiser_analysis_default_config_file = os.path.join( 4104 folder_config, param_exomiser_preset 4105 ) 4106 else: 4107 # Construct preset file 4108 param_exomiser_analysis_default_config_file = os.path.join( 4109 folder_config, 4110 f"preset-{param_exomiser_preset}-analysis.json", 4111 ) 4112 4113 # If preset file exists 4114 param_exomiser_analysis_default_config_file = full_path( 4115 
param_exomiser_analysis_default_config_file 4116 ) 4117 if os.path.exists(param_exomiser_analysis_default_config_file): 4118 # Load prest file into analysis dict (either yaml or json) 4119 with open( 4120 param_exomiser_analysis_default_config_file 4121 ) as json_file: 4122 # param_exomiser_analysis_dict[""] = json.load(json_file) 4123 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4124 json_file 4125 ) 4126 4127 # Error preset file 4128 else: 4129 log.error( 4130 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4131 ) 4132 raise ValueError( 4133 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4134 ) 4135 4136 # If no analysis dict created 4137 if not param_exomiser_analysis_dict: 4138 log.error(f"No analysis config") 4139 raise ValueError(f"No analysis config") 4140 4141 # Log 4142 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4143 4144 ### PHENOPACKET ### 4145 ################### 4146 4147 # If no PhenoPacket in analysis dict -> check in param 4148 if "phenopacket" not in param_exomiser_analysis_dict: 4149 4150 # If PhenoPacket in param -> load anlaysis json 4151 if param_exomiser.get("phenopacket", None): 4152 4153 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4154 param_exomiser_phenopacket = full_path( 4155 param_exomiser_phenopacket 4156 ) 4157 4158 # If param phenopacket is a file and exists 4159 if isinstance( 4160 param_exomiser_phenopacket, str 4161 ) and os.path.exists(param_exomiser_phenopacket): 4162 # Load phenopacket file into analysis dict (either yaml or json) 4163 with open(param_exomiser_phenopacket) as json_file: 4164 param_exomiser_analysis_dict["phenopacket"] = ( 4165 yaml.safe_load(json_file) 4166 ) 4167 4168 # If param phenopacket is a dict 4169 elif isinstance(param_exomiser_phenopacket, dict): 4170 # Load phenopacket dict into analysis dict (either yaml or json) 4171 param_exomiser_analysis_dict["phenopacket"] = ( 4172 
param_exomiser_phenopacket 4173 ) 4174 4175 # Error phenopacket type 4176 else: 4177 log.error(f"Phenopacket type unknown. Check param file.") 4178 raise ValueError( 4179 f"Phenopacket type unknown. Check param file." 4180 ) 4181 4182 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4183 if "phenopacket" not in param_exomiser_analysis_dict: 4184 4185 # Init PhenoPacket 4186 param_exomiser_analysis_dict["phenopacket"] = { 4187 "id": "analysis", 4188 "proband": {}, 4189 } 4190 4191 ### Add subject ### 4192 4193 # If subject exists 4194 param_exomiser_subject = param_exomiser.get("subject", {}) 4195 4196 # If subject not exists -> found sample ID 4197 if not param_exomiser_subject: 4198 4199 # Found sample ID in param 4200 sample = param_exomiser.get("sample", None) 4201 4202 # Find sample ID (first sample) 4203 if not sample: 4204 sample_list = self.get_header_sample_list() 4205 if len(sample_list) > 0: 4206 sample = sample_list[0] 4207 else: 4208 log.error(f"No sample found") 4209 raise ValueError(f"No sample found") 4210 4211 # Create subject 4212 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4213 4214 # Add to dict 4215 param_exomiser_analysis_dict["phenopacket"][ 4216 "subject" 4217 ] = param_exomiser_subject 4218 4219 ### Add "phenotypicFeatures" ### 4220 4221 # If phenotypicFeatures exists 4222 param_exomiser_phenotypicfeatures = param_exomiser.get( 4223 "phenotypicFeatures", [] 4224 ) 4225 4226 # If phenotypicFeatures not exists -> Try to infer from hpo list 4227 if not param_exomiser_phenotypicfeatures: 4228 4229 # Found HPO in param 4230 param_exomiser_hpo = param_exomiser.get("hpo", []) 4231 4232 # Split HPO if list in string format separated by comma 4233 if isinstance(param_exomiser_hpo, str): 4234 param_exomiser_hpo = param_exomiser_hpo.split(",") 4235 4236 # Create HPO list 4237 for hpo in param_exomiser_hpo: 4238 hpo_clean = re.sub("[^0-9]", "", hpo) 4239 param_exomiser_phenotypicfeatures.append( 4240 { 
4241 "type": { 4242 "id": f"HP:{hpo_clean}", 4243 "label": f"HP:{hpo_clean}", 4244 } 4245 } 4246 ) 4247 4248 # Add to dict 4249 param_exomiser_analysis_dict["phenopacket"][ 4250 "phenotypicFeatures" 4251 ] = param_exomiser_phenotypicfeatures 4252 4253 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4254 if not param_exomiser_phenotypicfeatures: 4255 for step in param_exomiser_analysis_dict.get( 4256 "analysis", {} 4257 ).get("steps", []): 4258 if "hiPhivePrioritiser" in step: 4259 param_exomiser_analysis_dict.get("analysis", {}).get( 4260 "steps", [] 4261 ).remove(step) 4262 4263 ### Add Input File ### 4264 4265 # Initial file name and htsFiles 4266 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4267 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4268 { 4269 "uri": tmp_vcf_name, 4270 "htsFormat": "VCF", 4271 "genomeAssembly": assembly, 4272 } 4273 ] 4274 4275 ### Add metaData ### 4276 4277 # If metaData not in analysis dict 4278 if "metaData" not in param_exomiser_analysis_dict: 4279 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4280 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4281 "createdBy": "howard", 4282 "phenopacketSchemaVersion": 1, 4283 } 4284 4285 ### OutputOptions ### 4286 4287 # Init output result folder 4288 output_results = os.path.join(tmp_dir, "results") 4289 4290 # If no outputOptions in analysis dict 4291 if "outputOptions" not in param_exomiser_analysis_dict: 4292 4293 # default output formats 4294 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4295 4296 # Get outputOptions in param 4297 output_options = param_exomiser.get("outputOptions", None) 4298 4299 # If no output_options in param -> check 4300 if not output_options: 4301 output_options = { 4302 "outputContributingVariantsOnly": False, 4303 "numGenes": 0, 4304 "outputFormats": defaut_output_formats, 4305 } 4306 4307 # Replace outputDirectory in output options 4308 output_options["outputDirectory"] = output_results 
4309 output_options["outputFileName"] = "howard" 4310 4311 # Add outputOptions in analysis dict 4312 param_exomiser_analysis_dict["outputOptions"] = output_options 4313 4314 else: 4315 4316 # Replace output_results and output format (if exists in param) 4317 param_exomiser_analysis_dict["outputOptions"][ 4318 "outputDirectory" 4319 ] = output_results 4320 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4321 list( 4322 set( 4323 param_exomiser_analysis_dict.get( 4324 "outputOptions", {} 4325 ).get("outputFormats", []) 4326 + ["TSV_VARIANT", "VCF"] 4327 ) 4328 ) 4329 ) 4330 4331 # log 4332 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4333 4334 ### ANALYSIS FILE ### 4335 ##################### 4336 4337 ### Full JSON analysis config file ### 4338 4339 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4340 with open(exomiser_analysis, "w") as fp: 4341 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4342 4343 ### SPLIT analysis and sample config files 4344 4345 # Splitted analysis dict 4346 param_exomiser_analysis_dict_for_split = ( 4347 param_exomiser_analysis_dict.copy() 4348 ) 4349 4350 # Phenopacket JSON file 4351 exomiser_analysis_phenopacket = os.path.join( 4352 tmp_dir, "analysis_phenopacket.json" 4353 ) 4354 with open(exomiser_analysis_phenopacket, "w") as fp: 4355 json.dump( 4356 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4357 fp, 4358 indent=4, 4359 ) 4360 4361 # Analysis JSON file without Phenopacket parameters 4362 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4363 exomiser_analysis_analysis = os.path.join( 4364 tmp_dir, "analysis_analysis.json" 4365 ) 4366 with open(exomiser_analysis_analysis, "w") as fp: 4367 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4368 4369 ### INITAL VCF file ### 4370 ####################### 4371 4372 ### Create list of samples to use and include inti initial VCF file #### 4373 4374 # Subject (main sample) 4375 # Get sample ID in 
analysis dict 4376 sample_subject = ( 4377 param_exomiser_analysis_dict.get("phenopacket", {}) 4378 .get("subject", {}) 4379 .get("id", None) 4380 ) 4381 sample_proband = ( 4382 param_exomiser_analysis_dict.get("phenopacket", {}) 4383 .get("proband", {}) 4384 .get("subject", {}) 4385 .get("id", None) 4386 ) 4387 sample = [] 4388 if sample_subject: 4389 sample.append(sample_subject) 4390 if sample_proband: 4391 sample.append(sample_proband) 4392 4393 # Get sample ID within Pedigree 4394 pedigree_persons_list = ( 4395 param_exomiser_analysis_dict.get("phenopacket", {}) 4396 .get("pedigree", {}) 4397 .get("persons", {}) 4398 ) 4399 4400 # Create list with all sample ID in pedigree (if exists) 4401 pedigree_persons = [] 4402 for person in pedigree_persons_list: 4403 pedigree_persons.append(person.get("individualId")) 4404 4405 # Concat subject sample ID and samples ID in pedigreesamples 4406 samples = list(set(sample + pedigree_persons)) 4407 4408 # Check if sample list is not empty 4409 if not samples: 4410 log.error(f"No samples found") 4411 raise ValueError(f"No samples found") 4412 4413 # Create VCF with sample (either sample in param or first one by default) 4414 # Export VCF file 4415 self.export_variant_vcf( 4416 vcf_file=tmp_vcf_name, 4417 remove_info=True, 4418 add_samples=True, 4419 list_samples=samples, 4420 index=False, 4421 ) 4422 4423 ### Execute Exomiser ### 4424 ######################## 4425 4426 # Init command 4427 exomiser_command = "" 4428 4429 # Command exomiser options 4430 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4431 4432 # Release 4433 exomiser_release = param_exomiser.get("release", None) 4434 if exomiser_release: 4435 # phenotype data version 4436 exomiser_options += ( 4437 f" --exomiser.phenotype.data-version={exomiser_release} " 4438 ) 4439 # data version 4440 exomiser_options += ( 4441 f" 
--exomiser.{assembly}.data-version={exomiser_release} " 4442 ) 4443 # variant white list 4444 variant_white_list_file = ( 4445 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4446 ) 4447 if os.path.exists( 4448 os.path.join( 4449 databases_folders, assembly, variant_white_list_file 4450 ) 4451 ): 4452 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4453 4454 # transcript_source 4455 transcript_source = param_exomiser.get( 4456 "transcript_source", None 4457 ) # ucsc, refseq, ensembl 4458 if transcript_source: 4459 exomiser_options += ( 4460 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4461 ) 4462 4463 # If analysis contain proband param 4464 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4465 "proband", {} 4466 ): 4467 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4468 4469 # If no proband (usually uniq sample) 4470 else: 4471 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4472 4473 # Log 4474 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4475 4476 # Run command 4477 result = subprocess.call( 4478 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4479 ) 4480 if result: 4481 log.error("Exomiser command failed") 4482 raise ValueError("Exomiser command failed") 4483 4484 ### RESULTS ### 4485 ############### 4486 4487 ### Annotate with TSV fields ### 4488 4489 # Init result tsv file 4490 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4491 4492 # Init result tsv file 4493 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 4494 4495 # Parse TSV file and explode columns in INFO field 4496 if exomiser_to_info and os.path.exists(output_results_tsv): 4497 4498 # Log 4499 log.debug("Exomiser columns to VCF INFO field") 4500 4501 # Retrieve columns and 
types 4502 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4503 output_results_tsv_df = self.get_query_to_df(query) 4504 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4505 4506 # Init concat fields for update 4507 sql_query_update_concat_fields = [] 4508 4509 # Fields to avoid 4510 fields_to_avoid = [ 4511 "CONTIG", 4512 "START", 4513 "END", 4514 "REF", 4515 "ALT", 4516 "QUAL", 4517 "FILTER", 4518 "GENOTYPE", 4519 ] 4520 4521 # List all columns to add into header 4522 for header_column in output_results_tsv_columns: 4523 4524 # If header column is enable 4525 if header_column not in fields_to_avoid: 4526 4527 # Header info type 4528 header_info_type = "String" 4529 header_column_df = output_results_tsv_df[header_column] 4530 header_column_df_dtype = header_column_df.dtype 4531 if header_column_df_dtype == object: 4532 if ( 4533 pd.to_numeric(header_column_df, errors="coerce") 4534 .notnull() 4535 .all() 4536 ): 4537 header_info_type = "Float" 4538 else: 4539 header_info_type = "Integer" 4540 4541 # Header info 4542 characters_to_validate = ["-"] 4543 pattern = "[" + "".join(characters_to_validate) + "]" 4544 header_info_name = re.sub( 4545 pattern, 4546 "_", 4547 f"Exomiser_{header_column}".replace("#", ""), 4548 ) 4549 header_info_number = "." 
4550 header_info_description = ( 4551 f"Exomiser {header_column} annotation" 4552 ) 4553 header_info_source = "Exomiser" 4554 header_info_version = "unknown" 4555 header_info_code = CODE_TYPE_MAP[header_info_type] 4556 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4557 header_info_name, 4558 header_info_number, 4559 header_info_type, 4560 header_info_description, 4561 header_info_source, 4562 header_info_version, 4563 header_info_code, 4564 ) 4565 4566 # Add field to add for update to concat fields 4567 sql_query_update_concat_fields.append( 4568 f""" 4569 CASE 4570 WHEN table_parquet."{header_column}" NOT IN ('','.') 4571 THEN concat( 4572 '{header_info_name}=', 4573 table_parquet."{header_column}", 4574 ';' 4575 ) 4576 4577 ELSE '' 4578 END 4579 """ 4580 ) 4581 4582 # Update query 4583 sql_query_update = f""" 4584 UPDATE {table_variants} as table_variants 4585 SET INFO = concat( 4586 CASE 4587 WHEN INFO NOT IN ('', '.') 4588 THEN INFO 4589 ELSE '' 4590 END, 4591 CASE 4592 WHEN table_variants.INFO NOT IN ('','.') 4593 THEN ';' 4594 ELSE '' 4595 END, 4596 ( 4597 SELECT 4598 concat( 4599 {",".join(sql_query_update_concat_fields)} 4600 ) 4601 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4602 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4603 AND table_parquet.\"START\" = table_variants.\"POS\" 4604 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4605 AND table_parquet.\"REF\" = table_variants.\"REF\" 4606 ) 4607 ) 4608 ; 4609 """ 4610 4611 # Update 4612 self.conn.execute(sql_query_update) 4613 4614 ### Annotate with VCF INFO field ### 4615 4616 # Init result VCF file 4617 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4618 4619 # If VCF exists 4620 if os.path.exists(output_results_vcf): 4621 4622 # Log 4623 log.debug("Exomiser result VCF update variants") 4624 4625 # Find Exomiser INFO field annotation in header 4626 with 
gzip.open(output_results_vcf, "rt") as f: 4627 header_list = self.read_vcf_header(f) 4628 exomiser_vcf_header = vcf.Reader( 4629 io.StringIO("\n".join(header_list)) 4630 ) 4631 4632 # Add annotation INFO field to header 4633 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4634 4635 # Update variants with VCF 4636 self.update_from_vcf(output_results_vcf) 4637 4638 return True
This function annotates with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate the variants table with snpEff.

        Workflow: resolve the snpEff command, make sure the snpEff database
        for the current assembly is present, export the variants to a
        temporary VCF, run snpEff on it, merge the new INFO header
        definitions back into this object's header, and update the variants
        table from the annotated VCF.

        :param threads: The number of threads to use (defaults to
            ``self.get_threads()`` when None)
        :return: None. Returns early without annotating when the variants
            table is empty.
        :raises ValueError: if no snpEff command can be resolved, or if the
            snpEff run wrote error lines (``[E::``) to stderr.
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads: fall back to the instance-level default
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Keep temporary files/folders when verbosity is "debug"
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # NOTE(review): former explicit java/snpEff.jar existence checks were
        # commented out in favour of get_bin_command(), which resolves the
        # full invocation (java + jar) in one string.

        # Config - snpEff bin command
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (logged here; snpeff_options below re-reads the
        # same key with a different default)
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - snpEff command-line options, plus optional (csv)stats
        # outputs; the literal "OUTPUT" token in stats paths is replaced by
        # the configured output file path.
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Nothing to annotate when the variants table is empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF: temporary input file for snpEff
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit (defaults to 8G)
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built but never injected into
        # snpeff_command below — presumably get_bin_command() already carries
        # the java invocation; also the log message says "Exomiser" although
        # this is the snpEff path. Verify both.
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Hard-coded True: annotation is always (re-)computed, so the "ANN"
        # presence check below is currently moot.
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Ensure the snpEff database for this assembly is available
            # (downloads it when missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped, no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp output file and stderr capture file
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF to stdout (redirected
            # to the tmp file) and diagnostics to the .err file
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Scan captured stderr for warnings ("[W::") and errors ("[E::")
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f" {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f" {message}")
            # failed: any error line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Merge INFO definitions found in the annotated VCF header into
            # this object's header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants table from the annotated VCF
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # Unreachable while force_update_annotation is hard-coded True
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates with snpEff
Parameters
- threads: The number of threads to use
Returns
the value of the variable "return_value".
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar.

        For each configured Annovar database: exports the variants to a
        temporary VCF, runs ``table_annovar.pl``, post-processes the output
        through a bcftools/sed/awk pipeline (cleaning ANNOVAR bookkeeping
        tags, empty fields and special characters, renaming INFO fields),
        then merges all per-database annotated VCFs with ``bcftools merge``
        and updates the variants table. Temporary files are removed at the
        end.

        :param threads: number of threads to use (defaults to
            ``self.get_threads()`` when None)
        :return: None. Returns early without annotating when the variants
            table is empty.
        :raises ValueError: if the annovar or bcftools command cannot be
            resolved, or if a pipeline step reports errors on stderr.
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads: fall back to the instance-level default
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err file registries (everything listed in tmp_files is
        # removed by the cleanup step at the end)
        tmp_files = []
        err_files = []

        # Keep temporary files/folders when verbosity is "debug"
        # NOTE(review): delete_tmp is computed/logged but the cleanup block
        # at the end runs unconditionally (`if True:`) — verify intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl via perl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl flags; "genebase" is
        # handled specially below)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping database name -> fields dict
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar per-assembly database subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Nothing to annotate when the variants table is empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded True: existing fields are always re-annotated
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used afterwards.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF: shared temporary input file for all databases
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to ".", no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by
            # `bcftools annotate --rename-annots` in the pipeline)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar databases (downloads missing ones)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run (and post-processing pipeline) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp working directory for this Annovar run
                # NOTE(review): err_files is reset per database, so the error
                # scan below only sees the current run's stderr — the merge
                # step later appends to this (now per-iteration) list.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file: final per-database VCF annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Collect fields to keep / their renamed counterparts
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # field already in header and no forced update
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (appended to the shared rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol: the Annovar database name
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (plus any extra user options except genebase)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command - Annovar run, then move its multianno output into
                # place for the post-processing pipeline
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # keep only explicitly requested fields ("^INFO/x" = keep x)
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Scan captured stderr for warnings and errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files to merge
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: original VCF + all per-database results
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Merge INFO definitions from the merged VCF header into this
                # object's header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files registered in tmp_files with a single rm command
        # NOTE(review): runs unconditionally despite delete_tmp above.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations
Parameters
- threads: number of threads to use
Returns
the value of the variable "return_value".
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or several parquet annotation
        databases.

        For each configured annotation database: the database header is read,
        the requested fields are mapped to database columns, and one SQL
        UPDATE query per chromosome appends `FIELD=value` pairs to the INFO
        column of the variants table. Optional "annotations_update" /
        "annotations_append" options force re-annotation of fields already
        present in the header.

        :param threads: number of threads to use for the annotation
        :return: None; the variants table INFO column is updated in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG - keep temporary files/folders when verbosity is "debug"
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - folders searched for annotation databases
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping {database: {field: new_name}} of annotations to apply
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation - re-annotate fields already in the header
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Append mode - only fill fields whose current value is empty or '.'
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header - updated in place with the newly annotated INFO fields
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped at the end of the method)
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan available databases and add each one
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields; default to the whole INFO column
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH if the database requires it)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (database columns beyond the VCF core columns)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields:
                    # register each extra column as a String INFO field
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database ("ALL"/"INFO" expands to every field)
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL fragments)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for "regions" databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column name)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: field exists in the database header and is
                        # either new, or re-annotation is forced (update/append)
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO before re-annotating
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults for
                            # any missing header attribute)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append mode: only fill values that are empty or '.'
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # (shortcut: copy the database INFO column wholesale; disabled
                    # in append mode and for "regions" databases)
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init - start from the "remove field" queries so they run first
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database:
                            # join on position falling within [START+1, END]
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database:
                            # exact match on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the generated
                            # 'FIELD=value' fragments to INFO, inserting a ';'
                            # separator only when INFO is already non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            AND (
                                                concat({sql_query_annotation_update_info_sets_sql})
                                            )
                                            NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can be deeply nested)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # NOTE(review): assumes the UPDATE result exposes a
                            # one-row 'Count' column (DuckDB behavior) — confirm
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file, and annotates it with a parquet file
Parameters
- threads: number of threads to use for the annotation
Returns
None; annotations are written into the variants table INFO column in place.
5837 def annotation_splice(self, threads: int = None) -> None: 5838 """ 5839 This function annotate with snpEff 5840 5841 :param threads: The number of threads to use 5842 :return: the value of the variable "return_value". 5843 """ 5844 5845 # DEBUG 5846 log.debug("Start annotation with splice tools") 5847 5848 # Threads 5849 if not threads: 5850 threads = self.get_threads() 5851 log.debug("Threads: " + str(threads)) 5852 5853 # DEBUG 5854 delete_tmp = True 5855 if self.get_config().get("verbosity", "warning") in ["debug"]: 5856 delete_tmp = False 5857 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5858 5859 # Config 5860 config = self.get_config() 5861 log.debug("Config: " + str(config)) 5862 splice_config = config.get("tools", {}).get("splice", {}) 5863 if not splice_config: 5864 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5865 if not splice_config: 5866 msg_err = "No Splice tool config" 5867 log.error(msg_err) 5868 raise ValueError(msg_err) 5869 log.debug(f"splice_config={splice_config}") 5870 5871 # Config - Folders - Databases 5872 databases_folders = ( 5873 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5874 ) 5875 log.debug("Databases annotations: " + str(databases_folders)) 5876 5877 # Splice docker image 5878 splice_docker_image = splice_config.get("docker").get("image") 5879 5880 # Pull splice image if it's not already there 5881 if not check_docker_image_exists(splice_docker_image): 5882 log.warning( 5883 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5884 ) 5885 try: 5886 command(f"docker pull {splice_config.get('docker').get('image')}") 5887 except subprocess.CalledProcessError: 5888 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5889 log.error(msg_err) 5890 raise ValueError(msg_err) 5891 return None 5892 5893 # Config - splice databases 5894 splice_databases = ( 5895 config.get("folders", {}) 5896 .get("databases", {}) 5897 
.get("splice", DEFAULT_SPLICE_FOLDER) 5898 ) 5899 splice_databases = full_path(splice_databases) 5900 5901 # Param 5902 param = self.get_param() 5903 log.debug("Param: " + str(param)) 5904 5905 # Param 5906 options = param.get("annotation", {}).get("splice", {}) 5907 log.debug("Options: " + str(options)) 5908 5909 # Data 5910 table_variants = self.get_table_variants() 5911 5912 # Check if not empty 5913 log.debug("Check if not empty") 5914 sql_query_chromosomes = ( 5915 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5916 ) 5917 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5918 log.info("VCF empty") 5919 return None 5920 5921 # Export in VCF 5922 log.debug("Create initial file to annotate") 5923 5924 # Create output folder 5925 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5926 if not os.path.exists(output_folder): 5927 Path(output_folder).mkdir(parents=True, exist_ok=True) 5928 5929 # Create tmp VCF file 5930 tmp_vcf = NamedTemporaryFile( 5931 prefix=self.get_prefix(), 5932 dir=output_folder, 5933 suffix=".vcf", 5934 delete=False, 5935 ) 5936 tmp_vcf_name = tmp_vcf.name 5937 5938 # VCF header 5939 header = self.get_header() 5940 5941 # Existing annotations 5942 for vcf_annotation in self.get_header().infos: 5943 5944 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5945 log.debug( 5946 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5947 ) 5948 5949 # Memory limit 5950 if config.get("memory", None): 5951 memory_limit = config.get("memory", "8G").upper() 5952 # upper() 5953 else: 5954 memory_limit = "8G" 5955 log.debug(f"memory_limit: {memory_limit}") 5956 5957 # Export VCF file 5958 self.export_variant_vcf( 5959 vcf_file=tmp_vcf_name, 5960 remove_info=True, 5961 add_samples=True, 5962 index=False, 5963 ) 5964 5965 # Create docker container and launch splice analysis 5966 if splice_config: 5967 5968 # Splice mount folders 5969 mount_folders = 
splice_config.get("mount", {}) 5970 5971 # Genome mount 5972 mount_folders[ 5973 config.get("folders", {}) 5974 .get("databases", {}) 5975 .get("genomes", DEFAULT_GENOME_FOLDER) 5976 ] = "ro" 5977 5978 # SpliceAI mount 5979 mount_folders[ 5980 config.get("folders", {}) 5981 .get("databases", {}) 5982 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 5983 ] = "ro" 5984 5985 # Genome mount 5986 mount_folders[ 5987 config.get("folders", {}) 5988 .get("databases", {}) 5989 .get("spip", DEFAULT_SPIP_FOLDER) 5990 ] = "ro" 5991 5992 # Mount folders 5993 mount = [] 5994 5995 # Config mount 5996 mount = [ 5997 f"-v {full_path(path)}:{full_path(path)}:{mode}" 5998 for path, mode in mount_folders.items() 5999 ] 6000 6001 if any(value for value in splice_config.values() if value is None): 6002 log.warning("At least one splice config parameter is empty") 6003 return None 6004 6005 # Params in splice nf 6006 def check_values(dico: dict): 6007 """ 6008 Ensure parameters for NF splice pipeline 6009 """ 6010 for key, val in dico.items(): 6011 if key == "genome": 6012 if any( 6013 assemb in options.get("genome", {}) 6014 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6015 ): 6016 yield f"--{key} hg19" 6017 elif any( 6018 assemb in options.get("genome", {}) 6019 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6020 ): 6021 yield f"--{key} hg38" 6022 elif ( 6023 (isinstance(val, str) and val) 6024 or isinstance(val, int) 6025 or isinstance(val, bool) 6026 ): 6027 yield f"--{key} {val}" 6028 6029 # Genome 6030 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6031 options["genome"] = genome 6032 6033 # NF params 6034 nf_params = [] 6035 6036 # Add options 6037 if options: 6038 nf_params = list(check_values(options)) 6039 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6040 else: 6041 log.debug("No NF params provided") 6042 6043 # Add threads 6044 if "threads" not in options.keys(): 6045 nf_params.append(f"--threads {threads}") 6046 6047 # Genome path 6048 
genome_path = find_genome( 6049 config.get("folders", {}) 6050 .get("databases", {}) 6051 .get("genomes", DEFAULT_GENOME_FOLDER), 6052 file=f"{genome}.fa", 6053 ) 6054 # Add genome path 6055 if not genome_path: 6056 raise ValueError( 6057 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6058 ) 6059 else: 6060 log.debug(f"Genome: {genome_path}") 6061 nf_params.append(f"--genome_path {genome_path}") 6062 6063 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6064 """ 6065 Setting up updated databases for SPiP and SpliceAI 6066 """ 6067 6068 try: 6069 6070 # SpliceAI assembly transcriptome 6071 spliceai_assembly = os.path.join( 6072 config.get("folders", {}) 6073 .get("databases", {}) 6074 .get("spliceai", {}), 6075 options.get("genome"), 6076 "transcriptome", 6077 ) 6078 spip_assembly = options.get("genome") 6079 6080 spip = find( 6081 f"transcriptome_{spip_assembly}.RData", 6082 config.get("folders", {}).get("databases", {}).get("spip", {}), 6083 ) 6084 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6085 log.debug(f"SPiP annotations: {spip}") 6086 log.debug(f"SpliceAI annotations: {spliceai}") 6087 if spip and spliceai: 6088 return [ 6089 f"--spip_transcriptome {spip}", 6090 f"--spliceai_annotations {spliceai}", 6091 ] 6092 else: 6093 # TODO crash and go on with basic annotations ? 
6094 # raise ValueError( 6095 # "Can't find splice databases in configuration EXIT" 6096 # ) 6097 log.warning( 6098 "Can't find splice databases in configuration, use annotations file from image" 6099 ) 6100 except TypeError: 6101 log.warning( 6102 "Can't find splice databases in configuration, use annotations file from image" 6103 ) 6104 return [] 6105 6106 # Add options, check if transcriptome option have already beend provided 6107 if ( 6108 "spip_transcriptome" not in nf_params 6109 and "spliceai_transcriptome" not in nf_params 6110 ): 6111 splice_reference = splice_annotations(options, config) 6112 if splice_reference: 6113 nf_params.extend(splice_reference) 6114 6115 nf_params.append(f"--output_folder {output_folder}") 6116 6117 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6118 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6119 log.debug(cmd) 6120 6121 splice_config["docker"]["command"] = cmd 6122 6123 docker_cmd = get_bin_command( 6124 tool="splice", 6125 bin_type="docker", 6126 config=config, 6127 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6128 add_options=f"--name {random_uuid} {' '.join(mount)}", 6129 ) 6130 6131 # Docker debug 6132 # if splice_config.get("rm_container"): 6133 # rm_container = "--rm" 6134 # else: 6135 # rm_container = "" 6136 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6137 6138 log.debug(docker_cmd) 6139 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6140 log.debug(res.stdout) 6141 if res.stderr: 6142 log.error(res.stderr) 6143 res.check_returncode() 6144 else: 6145 log.warning(f"Splice tool configuration not found: {config}") 6146 
6147 # Update variants 6148 log.info("Annotation - Updating...") 6149 # Test find output vcf 6150 log.debug( 6151 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6152 ) 6153 output_vcf = [] 6154 # Wrong folder to look in 6155 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6156 if ( 6157 files 6158 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6159 ): 6160 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6161 # log.debug(os.listdir(options.get("output_folder"))) 6162 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6163 if not output_vcf: 6164 log.debug( 6165 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6166 ) 6167 else: 6168 # Get new header from annotated vcf 6169 log.debug(f"Initial header: {len(header.infos)} fields") 6170 # Create new header with splice infos 6171 new_vcf = Variants(input=output_vcf[0]) 6172 new_vcf_header = new_vcf.get_header().infos 6173 for keys, infos in new_vcf_header.items(): 6174 if keys not in header.infos.keys(): 6175 header.infos[keys] = infos 6176 log.debug(f"New header: {len(header.infos)} fields") 6177 log.debug(f"Splice tmp output: {output_vcf[0]}") 6178 self.update_from_vcf(output_vcf[0]) 6179 6180 # Remove folder 6181 remove_if_exists(output_folder)
This function annotates variants with splice prediction tools (SPiP and SpliceAI).
Parameters
- threads: The number of threads to use
Returns
None; the variants table is updated in place.
6187 def get_config_default(self, name: str) -> dict: 6188 """ 6189 The function `get_config_default` returns a dictionary containing default configurations for 6190 various calculations and prioritizations. 6191 6192 :param name: The `get_config_default` function returns a dictionary containing default 6193 configurations for different calculations and prioritizations. The `name` parameter is used to 6194 specify which specific configuration to retrieve from the dictionary 6195 :type name: str 6196 :return: The function `get_config_default` returns a dictionary containing default configuration 6197 settings for different calculations and prioritizations. The specific configuration settings are 6198 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6199 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6200 returned. If there is no match, an empty dictionary is returned. 6201 """ 6202 6203 config_default = { 6204 "calculations": { 6205 "variant_chr_pos_alt_ref": { 6206 "type": "sql", 6207 "name": "variant_chr_pos_alt_ref", 6208 "description": "Create a variant ID with chromosome, position, alt and ref", 6209 "available": False, 6210 "output_column_name": "variant_chr_pos_alt_ref", 6211 "output_column_type": "String", 6212 "output_column_description": "variant ID with chromosome, position, alt and ref", 6213 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6214 "operation_info": True, 6215 }, 6216 "VARTYPE": { 6217 "type": "sql", 6218 "name": "VARTYPE", 6219 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6220 "available": True, 6221 "output_column_name": "VARTYPE", 6222 "output_column_type": "String", 6223 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6224 "operation_query": """ 6225 CASE 6226 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6227 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6228 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6229 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6230 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6231 ELSE 'UNDEFINED' 6232 END 6233 """, 6234 "info_fields": ["SVTYPE"], 6235 "operation_info": True, 6236 }, 6237 "snpeff_hgvs": { 6238 "type": "python", 6239 "name": "snpeff_hgvs", 6240 "description": "HGVS nomenclatures from snpEff annotation", 6241 "available": True, 6242 "function_name": "calculation_extract_snpeff_hgvs", 6243 "function_params": [], 6244 }, 6245 "NOMEN": { 6246 "type": "python", 6247 "name": "NOMEN", 6248 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6249 "available": True, 6250 "function_name": "calculation_extract_nomen", 6251 "function_params": [], 6252 }, 6253 "FINDBYPIPELINE": { 6254 "type": "python", 6255 "name": "FINDBYPIPELINE", 6256 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6257 "available": True, 6258 "function_name": "calculation_find_by_pipeline", 6259 "function_params": ["findbypipeline"], 6260 }, 6261 "FINDBYSAMPLE": { 6262 "type": "python", 6263 "name": "FINDBYSAMPLE", 6264 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6265 "available": True, 6266 "function_name": "calculation_find_by_pipeline", 6267 "function_params": ["findbysample"], 6268 }, 6269 "GENOTYPECONCORDANCE": { 6270 "type": "python", 6271 "name": "GENOTYPECONCORDANCE", 6272 "description": "Concordance of genotype for multi caller VCF", 6273 "available": True, 6274 "function_name": "calculation_genotype_concordance", 6275 "function_params": [], 6276 }, 6277 "BARCODE": { 6278 "type": "python", 6279 "name": "BARCODE", 6280 "description": "BARCODE as VaRank tool", 6281 "available": True, 6282 "function_name": "calculation_barcode", 6283 "function_params": [], 6284 }, 6285 "BARCODEFAMILY": { 6286 "type": "python", 6287 "name": "BARCODEFAMILY", 6288 "description": "BARCODEFAMILY as VaRank tool", 6289 "available": True, 6290 "function_name": "calculation_barcode_family", 6291 "function_params": ["BCF"], 6292 }, 6293 "TRIO": { 6294 "type": "python", 6295 "name": "TRIO", 6296 "description": "Inheritance for a trio family", 6297 "available": True, 6298 "function_name": "calculation_trio", 6299 "function_params": [], 6300 }, 6301 "VAF": { 6302 "type": "python", 6303 "name": "VAF", 6304 "description": "Variant Allele Frequency (VAF) harmonization", 6305 "available": True, 6306 "function_name": "calculation_vaf_normalization", 6307 "function_params": [], 6308 }, 6309 "VAF_stats": { 6310 "type": "python", 6311 "name": 
"VAF_stats", 6312 "description": "Variant Allele Frequency (VAF) statistics", 6313 "available": True, 6314 "function_name": "calculation_genotype_stats", 6315 "function_params": ["VAF"], 6316 }, 6317 "DP_stats": { 6318 "type": "python", 6319 "name": "DP_stats", 6320 "description": "Depth (DP) statistics", 6321 "available": True, 6322 "function_name": "calculation_genotype_stats", 6323 "function_params": ["DP"], 6324 }, 6325 "variant_id": { 6326 "type": "python", 6327 "name": "variant_id", 6328 "description": "Variant ID generated from variant position and type", 6329 "available": True, 6330 "function_name": "calculation_variant_id", 6331 "function_params": [], 6332 }, 6333 }, 6334 "prioritizations": { 6335 "default": { 6336 "filter": [ 6337 { 6338 "type": "notequals", 6339 "value": "!PASS|\\.", 6340 "score": 0, 6341 "flag": "FILTERED", 6342 "comment": ["Bad variant quality"], 6343 }, 6344 { 6345 "type": "equals", 6346 "value": "REJECT", 6347 "score": -20, 6348 "flag": "PASS", 6349 "comment": ["Bad variant quality"], 6350 }, 6351 ], 6352 "DP": [ 6353 { 6354 "type": "gte", 6355 "value": "50", 6356 "score": 5, 6357 "flag": "PASS", 6358 "comment": ["DP higher than 50"], 6359 } 6360 ], 6361 "ANN": [ 6362 { 6363 "type": "contains", 6364 "value": "HIGH", 6365 "score": 5, 6366 "flag": "PASS", 6367 "comment": [ 6368 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6369 ], 6370 }, 6371 { 6372 "type": "contains", 6373 "value": "MODERATE", 6374 "score": 3, 6375 "flag": "PASS", 6376 "comment": [ 6377 "A non-disruptive variant that might change protein effectiveness" 6378 ], 6379 }, 6380 { 6381 "type": "contains", 6382 "value": "LOW", 6383 "score": 0, 6384 "flag": "FILTERED", 6385 "comment": [ 6386 "Assumed to be mostly harmless or unlikely to change protein behavior" 6387 ], 6388 }, 6389 { 6390 "type": "contains", 6391 "value": "MODIFIER", 6392 "score": 0, 6393 
"flag": "FILTERED", 6394 "comment": [ 6395 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6396 ], 6397 }, 6398 ], 6399 } 6400 }, 6401 } 6402 6403 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The `get_config_default` function returns a dictionary containing default configurations for different calculations and prioritizations. The `name` parameter is used to specify which specific configuration to retrieve from the dictionary.
Returns
The function `get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter provided to the function. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.
6405 def get_config_json( 6406 self, name: str, config_dict: dict = {}, config_file: str = None 6407 ) -> dict: 6408 """ 6409 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6410 default values, a dictionary, and a file. 6411 6412 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6413 the name of the configuration. It is used to identify and retrieve the configuration settings 6414 for a specific component or module 6415 :type name: str 6416 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6417 dictionary that allows you to provide additional configuration settings or overrides. When you 6418 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6419 the key is the configuration setting you want to override or 6420 :type config_dict: dict 6421 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6422 specify the path to a configuration file that contains additional settings. If provided, the 6423 function will read the contents of this file and update the configuration dictionary with the 6424 values found in the file, overriding any existing values with the 6425 :type config_file: str 6426 :return: The function `get_config_json` returns a dictionary containing the configuration 6427 settings. 
6428 """ 6429 6430 # Create with default prioritizations 6431 config_default = self.get_config_default(name=name) 6432 configuration = config_default 6433 # log.debug(f"configuration={configuration}") 6434 6435 # Replace prioritizations from dict 6436 for config in config_dict: 6437 configuration[config] = config_dict[config] 6438 6439 # Replace prioritizations from file 6440 config_file = full_path(config_file) 6441 if config_file: 6442 if os.path.exists(config_file): 6443 with open(config_file) as config_file_content: 6444 config_file_dict = json.load(config_file_content) 6445 for config in config_file_dict: 6446 configuration[config] = config_file_dict[config] 6447 else: 6448 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6449 log.error(msg_error) 6450 raise ValueError(msg_error) 6451 6452 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The `name` parameter is a string that represents the name of the configuration section. It is used to identify and retrieve the configuration settings for a specific component or module.
- config_dict: The `config_dict` parameter is a dictionary that allows you to provide additional configuration settings or overrides. When you call the `get_config_json` function, you can pass a dictionary of key-value pairs where each key is a configuration setting you want to override.
- config_file: The `config_file` parameter is used to specify the path to a configuration file that contains additional settings. If provided, the function reads the contents of this file and updates the configuration dictionary with the values found in the file, overriding any existing values.
Returns
The function `get_config_json` returns a dictionary containing the merged configuration settings.
6477 def prioritization(self) -> None: 6478 """ 6479 It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other 6480 INFO fields 6481 """ 6482 6483 # Config 6484 config = self.get_config() 6485 6486 # Param 6487 param = self.get_param() 6488 6489 # Quick Prioritizations 6490 # prioritizations = param.get("prioritization", {}).get("prioritizations", "") 6491 6492 # Configuration profiles 6493 prioritization_config_file = param.get("prioritization", {}).get( 6494 "prioritization_config", None 6495 ) 6496 prioritization_config_file = full_path(prioritization_config_file) 6497 prioritizations_config = self.get_config_json( 6498 name="prioritizations", config_file=prioritization_config_file 6499 ) 6500 6501 # Prioritization options 6502 profiles = param.get("prioritization", {}).get("profiles", []) 6503 if isinstance(profiles, str): 6504 profiles = profiles.split(",") 6505 pzfields = param.get("prioritization", {}).get( 6506 "pzfields", ["PZFlag", "PZScore"] 6507 ) 6508 if isinstance(pzfields, str): 6509 pzfields = pzfields.split(",") 6510 default_profile = param.get("prioritization", {}).get("default_profile", None) 6511 pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_") 6512 prioritization_score_mode = param.get("prioritization", {}).get( 6513 "prioritization_score_mode", "HOWARD" 6514 ) 6515 6516 # Quick Prioritizations 6517 # prioritizations = param.get("prioritization", {}).get("prioritizations", None) 6518 prioritizations = param.get("prioritizations", None) 6519 if prioritizations: 6520 log.info("Quick Prioritization:") 6521 for profile in prioritizations.split(","): 6522 if profile not in profiles: 6523 profiles.append(profile) 6524 log.info(f" {profile}") 6525 6526 # If profile "ALL" provided, all profiles in the config profiles 6527 if "ALL" in profiles: 6528 profiles = list(prioritizations_config.keys()) 6529 6530 for profile in profiles: 6531 if prioritizations_config.get(profile, None): 6532 
log.debug(f"Profile '{profile}' configured") 6533 else: 6534 msg_error = f"Profile '{profile}' NOT configured" 6535 log.error(msg_error) 6536 raise ValueError(msg_error) 6537 6538 if profiles: 6539 log.info(f"Prioritization... ") 6540 else: 6541 log.debug(f"No profile defined") 6542 return 6543 6544 if not default_profile and len(profiles): 6545 default_profile = profiles[0] 6546 6547 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6548 log.debug("Profiles to check: " + str(list(profiles))) 6549 6550 # Variables 6551 table_variants = self.get_table_variants(clause="update") 6552 6553 # Added columns 6554 added_columns = [] 6555 6556 # Create list of PZfields 6557 # List of PZFields 6558 list_of_pzfields_original = pzfields + [ 6559 pzfield + pzfields_sep + profile 6560 for pzfield in pzfields 6561 for profile in profiles 6562 ] 6563 list_of_pzfields = [] 6564 log.debug(f"{list_of_pzfields_original}") 6565 6566 # Remove existing PZfields to use if exists 6567 for pzfield in list_of_pzfields_original: 6568 if self.get_header().infos.get(pzfield, None) is None: 6569 list_of_pzfields.append(pzfield) 6570 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6571 else: 6572 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6573 6574 if list_of_pzfields: 6575 6576 # Explode Infos fields 6577 explode_infos_prefix = self.get_explode_infos_prefix() 6578 added_columns += self.explode_infos(prefix=explode_infos_prefix) 6579 extra_infos = self.get_extra_infos() 6580 6581 # PZfields tags description 6582 PZfields_INFOS = { 6583 "PZTags": { 6584 "ID": "PZTags", 6585 "Number": ".", 6586 "Type": "String", 6587 "Description": "Variant tags based on annotation criteria", 6588 }, 6589 "PZScore": { 6590 "ID": "PZScore", 6591 "Number": 1, 6592 "Type": "Integer", 6593 "Description": "Variant score based on annotation criteria", 6594 }, 6595 "PZFlag": { 6596 "ID": "PZFlag", 6597 "Number": 1, 6598 "Type": "String", 6599 
"Description": "Variant flag based on annotation criteria", 6600 }, 6601 "PZComment": { 6602 "ID": "PZComment", 6603 "Number": ".", 6604 "Type": "String", 6605 "Description": "Variant comment based on annotation criteria", 6606 }, 6607 "PZInfos": { 6608 "ID": "PZInfos", 6609 "Number": ".", 6610 "Type": "String", 6611 "Description": "Variant infos based on annotation criteria", 6612 }, 6613 } 6614 6615 # Create INFO fields if not exist 6616 for field in PZfields_INFOS: 6617 field_ID = PZfields_INFOS[field]["ID"] 6618 field_description = PZfields_INFOS[field]["Description"] 6619 if field_ID not in self.get_header().infos and field_ID in pzfields: 6620 field_description = ( 6621 PZfields_INFOS[field]["Description"] 6622 + f", profile {default_profile}" 6623 ) 6624 self.get_header().infos[field_ID] = vcf.parser._Info( 6625 field_ID, 6626 PZfields_INFOS[field]["Number"], 6627 PZfields_INFOS[field]["Type"], 6628 field_description, 6629 "unknown", 6630 "unknown", 6631 code_type_map[PZfields_INFOS[field]["Type"]], 6632 ) 6633 6634 # Create INFO fields if not exist for each profile 6635 for profile in prioritizations_config: 6636 if profile in profiles or profiles == []: 6637 for field in PZfields_INFOS: 6638 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6639 field_description = ( 6640 PZfields_INFOS[field]["Description"] 6641 + f", profile {profile}" 6642 ) 6643 if ( 6644 field_ID not in self.get_header().infos 6645 and field in pzfields 6646 ): 6647 self.get_header().infos[field_ID] = vcf.parser._Info( 6648 field_ID, 6649 PZfields_INFOS[field]["Number"], 6650 PZfields_INFOS[field]["Type"], 6651 field_description, 6652 "unknown", 6653 "unknown", 6654 code_type_map[PZfields_INFOS[field]["Type"]], 6655 ) 6656 6657 # Header 6658 for pzfield in list_of_pzfields: 6659 if re.match("PZScore.*", pzfield): 6660 added_column = self.add_column( 6661 table_name=table_variants, 6662 column_name=pzfield, 6663 column_type="INTEGER", 6664 default_value="0", 6665 ) 6666 
elif re.match("PZFlag.*", pzfield): 6667 added_column = self.add_column( 6668 table_name=table_variants, 6669 column_name=pzfield, 6670 column_type="BOOLEAN", 6671 default_value="1", 6672 ) 6673 else: 6674 added_column = self.add_column( 6675 table_name=table_variants, 6676 column_name=pzfield, 6677 column_type="STRING", 6678 default_value="''", 6679 ) 6680 added_columns.append(added_column) 6681 6682 # Profiles 6683 if profiles: 6684 6685 # foreach profile in configuration file 6686 for profile in prioritizations_config: 6687 6688 # If profile is asked in param, or ALL are asked (empty profile []) 6689 if profile in profiles or profiles == []: 6690 log.info(f"Profile '{profile}'") 6691 6692 sql_set_info_option = "" 6693 6694 sql_set_info = [] 6695 6696 # PZ fields set 6697 6698 # PZScore 6699 if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields: 6700 sql_set_info.append( 6701 f""" 6702 concat( 6703 'PZScore{pzfields_sep}{profile}=', 6704 PZScore{pzfields_sep}{profile} 6705 ) 6706 """ 6707 ) 6708 if ( 6709 profile == default_profile 6710 and "PZScore" in list_of_pzfields 6711 ): 6712 sql_set_info.append( 6713 f""" 6714 concat( 6715 'PZScore=', 6716 PZScore{pzfields_sep}{profile} 6717 ) 6718 """ 6719 ) 6720 6721 # PZFlag 6722 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6723 sql_set_info.append( 6724 f""" 6725 concat( 6726 'PZFlag{pzfields_sep}{profile}=', 6727 CASE 6728 WHEN PZFlag{pzfields_sep}{profile}==1 6729 THEN 'PASS' 6730 WHEN PZFlag{pzfields_sep}{profile}==0 6731 THEN 'FILTERED' 6732 END 6733 ) 6734 """ 6735 ) 6736 if ( 6737 profile == default_profile 6738 and "PZFlag" in list_of_pzfields 6739 ): 6740 sql_set_info.append( 6741 f""" 6742 concat( 6743 'PZFlag=', 6744 CASE 6745 WHEN PZFlag{pzfields_sep}{profile}==1 6746 THEN 'PASS' 6747 WHEN PZFlag{pzfields_sep}{profile}==0 6748 THEN 'FILTERED' 6749 END 6750 ) 6751 """ 6752 ) 6753 6754 # PZComment 6755 if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields: 6756 sql_set_info.append( 6757 
f""" 6758 CASE 6759 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6760 THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile}) 6761 ELSE '' 6762 END 6763 """ 6764 ) 6765 if ( 6766 profile == default_profile 6767 and "PZComment" in list_of_pzfields 6768 ): 6769 sql_set_info.append( 6770 f""" 6771 CASE 6772 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6773 THEN concat('PZComment=', PZComment{pzfields_sep}{profile}) 6774 ELSE '' 6775 END 6776 """ 6777 ) 6778 6779 # PZInfos 6780 if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields: 6781 sql_set_info.append( 6782 f""" 6783 CASE 6784 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6785 THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile}) 6786 ELSE '' 6787 END 6788 """ 6789 ) 6790 if ( 6791 profile == default_profile 6792 and "PZInfos" in list_of_pzfields 6793 ): 6794 sql_set_info.append( 6795 f""" 6796 CASE 6797 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6798 THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile}) 6799 ELSE '' 6800 END 6801 """ 6802 ) 6803 6804 # Merge PZfields 6805 sql_set_info_option = "" 6806 sql_set_sep = "" 6807 for sql_set in sql_set_info: 6808 if sql_set_sep: 6809 sql_set_info_option += f""" 6810 , concat('{sql_set_sep}', {sql_set}) 6811 """ 6812 else: 6813 sql_set_info_option += f""" 6814 , {sql_set} 6815 """ 6816 sql_set_sep = ";" 6817 6818 sql_queries = [] 6819 for annotation in prioritizations_config[profile]: 6820 6821 # Check if annotation field is present 6822 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 6823 log.debug(f"Annotation '{annotation}' not in data") 6824 continue 6825 else: 6826 log.debug(f"Annotation '{annotation}' in data") 6827 6828 # For each criterions 6829 for criterion in prioritizations_config[profile][ 6830 annotation 6831 ]: 6832 criterion_type = criterion["type"] 6833 criterion_value = criterion["value"] 6834 criterion_score = criterion.get("score", 0) 6835 criterion_flag = 
criterion.get("flag", "PASS") 6836 criterion_flag_bool = criterion_flag == "PASS" 6837 criterion_comment = ( 6838 ", ".join(criterion.get("comment", [])) 6839 .replace("'", "''") 6840 .replace(";", ",") 6841 .replace("\t", " ") 6842 ) 6843 criterion_infos = ( 6844 str(criterion) 6845 .replace("'", "''") 6846 .replace(";", ",") 6847 .replace("\t", " ") 6848 ) 6849 6850 sql_set = [] 6851 sql_set_info = [] 6852 6853 # PZ fields set 6854 if ( 6855 f"PZScore{pzfields_sep}{profile}" 6856 in list_of_pzfields 6857 ): 6858 if prioritization_score_mode == "HOWARD": 6859 sql_set.append( 6860 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6861 ) 6862 elif prioritization_score_mode == "VaRank": 6863 sql_set.append( 6864 f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END" 6865 ) 6866 else: 6867 sql_set.append( 6868 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6869 ) 6870 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6871 sql_set.append( 6872 f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}" 6873 ) 6874 if ( 6875 f"PZComment{pzfields_sep}{profile}" 6876 in list_of_pzfields 6877 ): 6878 sql_set.append( 6879 f""" 6880 PZComment{pzfields_sep}{profile} = 6881 concat( 6882 PZComment{pzfields_sep}{profile}, 6883 CASE 6884 WHEN PZComment{pzfields_sep}{profile}!='' 6885 THEN ', ' 6886 ELSE '' 6887 END, 6888 '{criterion_comment}' 6889 ) 6890 """ 6891 ) 6892 if ( 6893 f"PZInfos{pzfields_sep}{profile}" 6894 in list_of_pzfields 6895 ): 6896 sql_set.append( 6897 f""" 6898 PZInfos{pzfields_sep}{profile} = 6899 concat( 6900 PZInfos{pzfields_sep}{profile}, 6901 '{criterion_infos}' 6902 ) 6903 """ 6904 ) 6905 sql_set_option = ",".join(sql_set) 6906 6907 # Criterion and comparison 6908 try: 6909 float(criterion_value) 6910 sql_update = f""" 6911 UPDATE {table_variants} 6912 SET {sql_set_option} 
6913 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 6914 AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value} 6915 """ 6916 except: 6917 contains_option = "" 6918 if criterion_type == "contains": 6919 contains_option = ".*" 6920 sql_update = f""" 6921 UPDATE {table_variants} 6922 SET {sql_set_option} 6923 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 6924 """ 6925 sql_queries.append(sql_update) 6926 6927 # PZTags 6928 if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields: 6929 6930 # Create PZFalgs value 6931 pztags_value = "" 6932 pztags_sep_default = "|" 6933 pztags_sep = "" 6934 for pzfield in pzfields: 6935 if pzfield not in ["PZTags"]: 6936 if ( 6937 f"{pzfield}{pzfields_sep}{profile}" 6938 in list_of_pzfields 6939 ): 6940 if pzfield in ["PZFlag"]: 6941 pztags_value += f"""{pztags_sep}{pzfield}#', 6942 CASE WHEN PZFlag{pzfields_sep}{profile} 6943 THEN 'PASS' 6944 ELSE 'FILTERED' 6945 END, '""" 6946 else: 6947 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 6948 pztags_sep = pztags_sep_default 6949 6950 # Add Query update for PZFlags 6951 sql_update_pztags = f""" 6952 UPDATE {table_variants} 6953 SET INFO = concat( 6954 INFO, 6955 CASE WHEN INFO NOT in ('','.') 6956 THEN ';' 6957 ELSE '' 6958 END, 6959 'PZTags{pzfields_sep}{profile}={pztags_value}' 6960 ) 6961 """ 6962 sql_queries.append(sql_update_pztags) 6963 6964 # Add Query update for PZFlags for default 6965 if profile == default_profile: 6966 sql_update_pztags_default = f""" 6967 UPDATE {table_variants} 6968 SET INFO = concat( 6969 INFO, 6970 ';', 6971 'PZTags={pztags_value}' 6972 ) 6973 """ 6974 sql_queries.append(sql_update_pztags_default) 6975 6976 log.info(f"""Profile '{profile}' - Prioritization... 
""") 6977 6978 if sql_queries: 6979 6980 for sql_query in sql_queries: 6981 log.debug( 6982 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 6983 ) 6984 self.conn.execute(sql_query) 6985 6986 log.info(f"""Profile '{profile}' - Update... """) 6987 sql_query_update = f""" 6988 UPDATE {table_variants} 6989 SET INFO = 6990 concat( 6991 CASE 6992 WHEN INFO NOT IN ('','.') 6993 THEN concat(INFO, ';') 6994 ELSE '' 6995 END 6996 {sql_set_info_option} 6997 ) 6998 """ 6999 self.conn.execute(sql_query_update) 7000 7001 else: 7002 7003 log.warning(f"No profiles in parameters") 7004 7005 # Remove added columns 7006 for added_column in added_columns: 7007 self.drop_column(column=added_column) 7008 7009 # Explode INFOS fields into table fields 7010 if self.get_explode_infos(): 7011 self.explode_infos( 7012 prefix=self.get_explode_infos_prefix(), 7013 fields=self.get_explode_infos_fields(), 7014 force=True, 7015 ) 7016 7017 return
Annotates the variants table with prioritization INFO fields (PZScore, PZFlag, PZComment, PZInfos, PZTags), computed from the values of existing INFO fields according to the configured prioritization profiles.
7023 def annotation_hgvs(self, threads: int = None) -> None: 7024 """ 7025 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7026 coordinates and alleles. 7027 7028 :param threads: The `threads` parameter is an optional integer that specifies the number of 7029 threads to use for parallel processing. If no value is provided, it will default to the number 7030 of threads obtained from the `get_threads()` method 7031 :type threads: int 7032 """ 7033 7034 # Function for each partition of the Dask Dataframe 7035 def partition_function(partition): 7036 """ 7037 The function `partition_function` applies the `annotation_hgvs_partition` function to 7038 each row of a DataFrame called `partition`. 7039 7040 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7041 to be processed 7042 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7043 the "partition" dataframe along the axis 1. 7044 """ 7045 return partition.apply(annotation_hgvs_partition, axis=1) 7046 7047 def annotation_hgvs_partition(row) -> str: 7048 """ 7049 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7050 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7051 7052 :param row: A dictionary-like object that contains the values for the following keys: 7053 :return: a string that contains the HGVS names associated with the given row of data. 
7054 """ 7055 7056 chr = row["CHROM"] 7057 pos = row["POS"] 7058 ref = row["REF"] 7059 alt = row["ALT"] 7060 7061 # Find list of associated transcripts 7062 transcripts_list = list( 7063 polars_conn.execute( 7064 f""" 7065 SELECT transcript 7066 FROM refseq_df 7067 WHERE CHROM='{chr}' 7068 AND POS={pos} 7069 """ 7070 )["transcript"] 7071 ) 7072 7073 # Full HGVS annotation in list 7074 hgvs_full_list = [] 7075 7076 for transcript_name in transcripts_list: 7077 7078 # Transcript 7079 transcript = get_transcript( 7080 transcripts=transcripts, transcript_name=transcript_name 7081 ) 7082 # Exon 7083 if use_exon: 7084 exon = transcript.find_exon_number(pos) 7085 else: 7086 exon = None 7087 # Protein 7088 transcript_protein = None 7089 if use_protein or add_protein or full_format: 7090 transcripts_protein = list( 7091 polars_conn.execute( 7092 f""" 7093 SELECT protein 7094 FROM refseqlink_df 7095 WHERE transcript='{transcript_name}' 7096 LIMIT 1 7097 """ 7098 )["protein"] 7099 ) 7100 if len(transcripts_protein): 7101 transcript_protein = transcripts_protein[0] 7102 7103 # HGVS name 7104 hgvs_name = format_hgvs_name( 7105 chr, 7106 pos, 7107 ref, 7108 alt, 7109 genome=genome, 7110 transcript=transcript, 7111 transcript_protein=transcript_protein, 7112 exon=exon, 7113 use_gene=use_gene, 7114 use_protein=use_protein, 7115 full_format=full_format, 7116 use_version=use_version, 7117 codon_type=codon_type, 7118 ) 7119 hgvs_full_list.append(hgvs_name) 7120 if add_protein and not use_protein and not full_format: 7121 hgvs_name = format_hgvs_name( 7122 chr, 7123 pos, 7124 ref, 7125 alt, 7126 genome=genome, 7127 transcript=transcript, 7128 transcript_protein=transcript_protein, 7129 exon=exon, 7130 use_gene=use_gene, 7131 use_protein=True, 7132 full_format=False, 7133 use_version=use_version, 7134 codon_type=codon_type, 7135 ) 7136 hgvs_full_list.append(hgvs_name) 7137 7138 # Create liste of HGVS annotations 7139 hgvs_full = ",".join(hgvs_full_list) 7140 7141 return hgvs_full 7142 
7143 # Polars connexion 7144 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7145 7146 # Config 7147 config = self.get_config() 7148 7149 # Databases 7150 # Genome 7151 databases_genomes_folders = ( 7152 config.get("folders", {}) 7153 .get("databases", {}) 7154 .get("genomes", DEFAULT_GENOME_FOLDER) 7155 ) 7156 databases_genome = ( 7157 config.get("folders", {}).get("databases", {}).get("genomes", "") 7158 ) 7159 # refseq database folder 7160 databases_refseq_folders = ( 7161 config.get("folders", {}) 7162 .get("databases", {}) 7163 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7164 ) 7165 # refseq 7166 databases_refseq = config.get("databases", {}).get("refSeq", None) 7167 # refSeqLink 7168 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7169 7170 # Param 7171 param = self.get_param() 7172 7173 # Quick HGVS 7174 if "hgvs_options" in param and param.get("hgvs_options", ""): 7175 log.info(f"Quick HGVS Annotation:") 7176 if not param.get("hgvs", None): 7177 param["hgvs"] = {} 7178 for option in param.get("hgvs_options", "").split(","): 7179 option_var_val = option.split("=") 7180 option_var = option_var_val[0] 7181 if len(option_var_val) > 1: 7182 option_val = option_var_val[1] 7183 else: 7184 option_val = "True" 7185 if option_val.upper() in ["TRUE"]: 7186 option_val = True 7187 elif option_val.upper() in ["FALSE"]: 7188 option_val = False 7189 log.info(f" {option_var}={option_val}") 7190 param["hgvs"][option_var] = option_val 7191 7192 # Check if HGVS annotation enabled 7193 if "hgvs" in param: 7194 log.info(f"HGVS Annotation... 
") 7195 for hgvs_option in param.get("hgvs", {}): 7196 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7197 else: 7198 return 7199 7200 # HGVS Param 7201 param_hgvs = param.get("hgvs", {}) 7202 use_exon = param_hgvs.get("use_exon", False) 7203 use_gene = param_hgvs.get("use_gene", False) 7204 use_protein = param_hgvs.get("use_protein", False) 7205 add_protein = param_hgvs.get("add_protein", False) 7206 full_format = param_hgvs.get("full_format", False) 7207 use_version = param_hgvs.get("use_version", False) 7208 codon_type = param_hgvs.get("codon_type", "3") 7209 7210 # refSseq refSeqLink 7211 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7212 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7213 7214 # Assembly 7215 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7216 7217 # Genome 7218 genome_file = None 7219 if find_genome(databases_genome): 7220 genome_file = find_genome(databases_genome) 7221 else: 7222 genome_file = find_genome( 7223 genome_path=databases_genomes_folders, assembly=assembly 7224 ) 7225 log.debug("Genome: " + str(genome_file)) 7226 7227 # refSseq 7228 refseq_file = find_file_prefix( 7229 input_file=databases_refseq, 7230 prefix="ncbiRefSeq", 7231 folder=databases_refseq_folders, 7232 assembly=assembly, 7233 ) 7234 log.debug("refSeq: " + str(refseq_file)) 7235 7236 # refSeqLink 7237 refseqlink_file = find_file_prefix( 7238 input_file=databases_refseqlink, 7239 prefix="ncbiRefSeqLink", 7240 folder=databases_refseq_folders, 7241 assembly=assembly, 7242 ) 7243 log.debug("refSeqLink: " + str(refseqlink_file)) 7244 7245 # Threads 7246 if not threads: 7247 threads = self.get_threads() 7248 log.debug("Threads: " + str(threads)) 7249 7250 # Variables 7251 table_variants = self.get_table_variants(clause="update") 7252 7253 # Get variants SNV and InDel only 7254 query_variants = f""" 7255 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7256 FROM {table_variants} 7257 WHERE REF 
~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7258 """ 7259 df_variants = self.get_query_to_df(query_variants) 7260 7261 # Added columns 7262 added_columns = [] 7263 7264 # Add hgvs column in variants table 7265 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7266 added_column = self.add_column( 7267 table_variants, hgvs_column_name, "STRING", default_value=None 7268 ) 7269 added_columns.append(added_column) 7270 7271 log.debug(f"refSeq loading...") 7272 # refSeq in duckDB 7273 refseq_table = get_refseq_table( 7274 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7275 ) 7276 # Loading all refSeq in Dataframe 7277 refseq_query = f""" 7278 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7279 FROM {refseq_table} 7280 JOIN df_variants ON ( 7281 {refseq_table}.chrom = df_variants.CHROM 7282 AND {refseq_table}.txStart<=df_variants.POS 7283 AND {refseq_table}.txEnd>=df_variants.POS 7284 ) 7285 """ 7286 refseq_df = self.conn.query(refseq_query).pl() 7287 7288 if refseqlink_file: 7289 log.debug(f"refSeqLink loading...") 7290 # refSeqLink in duckDB 7291 refseqlink_table = get_refseq_table( 7292 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7293 ) 7294 # Loading all refSeqLink in Dataframe 7295 protacc_column = "protAcc_with_ver" 7296 mrnaacc_column = "mrnaAcc_with_ver" 7297 refseqlink_query = f""" 7298 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7299 FROM {refseqlink_table} 7300 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7301 WHERE protAcc_without_ver IS NOT NULL 7302 """ 7303 # Polars Dataframe 7304 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7305 7306 # Read RefSeq transcripts into a python dict/model. 
7307 log.debug(f"Transcripts loading...") 7308 with tempfile.TemporaryDirectory() as tmpdir: 7309 transcripts_query = f""" 7310 COPY ( 7311 SELECT {refseq_table}.* 7312 FROM {refseq_table} 7313 JOIN df_variants ON ( 7314 {refseq_table}.chrom=df_variants.CHROM 7315 AND {refseq_table}.txStart<=df_variants.POS 7316 AND {refseq_table}.txEnd>=df_variants.POS 7317 ) 7318 ) 7319 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7320 """ 7321 self.conn.query(transcripts_query) 7322 with open(f"{tmpdir}/transcript.tsv") as infile: 7323 transcripts = read_transcripts(infile) 7324 7325 # Polars connexion 7326 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7327 7328 log.debug("Genome loading...") 7329 # Read genome sequence using pyfaidx. 7330 genome = Fasta(genome_file) 7331 7332 log.debug("Start annotation HGVS...") 7333 7334 # Create 7335 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7336 ddf = dd.from_pandas(df_variants, npartitions=threads) 7337 7338 # Use dask.dataframe.apply() to apply function on each partition 7339 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7340 7341 # Convert Dask DataFrame to Pandas Dataframe 7342 df = ddf.compute() 7343 7344 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7345 with tempfile.TemporaryDirectory() as tmpdir: 7346 df_parquet = os.path.join(tmpdir, "df.parquet") 7347 df.to_parquet(df_parquet) 7348 7349 # Update hgvs column 7350 update_variant_query = f""" 7351 UPDATE {table_variants} 7352 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7353 FROM read_parquet('{df_parquet}') as df 7354 WHERE variants."#CHROM" = df.CHROM 7355 AND variants.POS = df.POS 7356 AND variants.REF = df.REF 7357 AND variants.ALT = df.ALT 7358 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7359 """ 7360 self.execute_query(update_variant_query) 7361 7362 # Update INFO column 7363 sql_query_update = f""" 7364 UPDATE {table_variants} 7365 SET INFO = 7366 concat( 7367 CASE 7368 WHEN INFO NOT IN ('','.') 7369 THEN concat(INFO, ';') 7370 ELSE '' 7371 END, 7372 'hgvs=', 7373 {hgvs_column_name} 7374 ) 7375 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7376 """ 7377 self.execute_query(sql_query_update) 7378 7379 # Add header 7380 HGVS_INFOS = { 7381 "hgvs": { 7382 "ID": "hgvs", 7383 "Number": ".", 7384 "Type": "String", 7385 "Description": f"HGVS annotatation with HOWARD", 7386 } 7387 } 7388 7389 for field in HGVS_INFOS: 7390 field_ID = HGVS_INFOS[field]["ID"] 7391 field_description = HGVS_INFOS[field]["Description"] 7392 self.get_header().infos[field_ID] = vcf.parser._Info( 7393 field_ID, 7394 HGVS_INFOS[field]["Number"], 7395 HGVS_INFOS[field]["Type"], 7396 field_description, 7397 "unknown", 7398 "unknown", 7399 code_type_map[HGVS_INFOS[field]["Type"]], 7400 ) 7401 7402 # Remove added columns 7403 for added_column in added_columns: 7404 self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.

Parameters
- threads: an optional integer that specifies the number of threads to use for parallel
  processing. If no value is provided, it defaults to the number of threads obtained from
  the get_threads() method.
def get_operations_help(
    self, operations_config_dict: dict = {}, operations_config_file: str = None
) -> list:
    """
    Build a sorted, human-readable list of the available calculation operations.

    :param operations_config_dict: optional calculations configuration as a dict
    :param operations_config_file: optional path to a calculations configuration file
    :return: list of help lines, starting with a header line
    """

    # Load the calculations configuration (dict and/or file)
    calculations_config = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )

    # One help line per operation flagged as available
    help_lines = []
    for config_key, config_entry in calculations_config.items():
        if not config_entry.get("available", False):
            continue
        label = config_entry.get("name", config_key).upper()
        help_lines.append(f" {label}: {config_entry.get('description', label)}")

    # Alphabetical order, header line first
    help_lines.sort()
    help_lines.insert(0, "Available calculation operations:")

    return help_lines
def calculation(
    self,
    operations: dict = None,
    operations_config_dict: dict = {},
    operations_config_file: str = None,
) -> None:
    """
    Run the configured calculation operations on the variants table.

    For each requested operation, look up its definition in the calculations
    configuration and dispatch to either the python or the sql processor.

    :param operations: operations to run, keyed by (upper-cased) operation name;
        defaults to the operations found in param 'calculation.calculations'.
        Fixed: the default used to be a shared mutable ``{}`` that the quick
        calculation branch mutated, leaking state across calls.
    :param operations_config_dict: calculations configuration as a dict
    :param operations_config_file: path to a calculations configuration file
    :raises ValueError: if an operation or its type is not available

    param json example:
        "calculation": {
            "NOMEN": {
                "options": {
                    "hgvs_field": "hgvs"
                },
                "middle": null
            }
        }
    """

    # Avoid the shared mutable default argument
    if operations is None:
        operations = {}

    # Param
    param = self.get_param()

    # Operations config (dict and/or file)
    operations_config = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )

    # Normalize config keys to upper case for case-insensitive lookup
    operations_config = {k.upper(): v for k, v in operations_config.items()}

    # Operations from param (fall back to the argument)
    operations = param.get("calculation", {}).get("calculations", operations)

    # Quick calculations: flat comma-separated 'calculations' param adds
    # operations (with empty options) to both 'operations' and 'param'
    if param.get("calculations", None):
        calculations_list = [
            value for value in param.get("calculations", "").split(",")
        ]
        log.info("Quick Calculations:")
        for calculation_key in calculations_list:
            log.info(f" {calculation_key}")
        for calculation_operation in calculations_list:
            if calculation_operation.upper() not in operations:
                operations[calculation_operation.upper()] = {}
                add_value_into_dict(
                    dict_tree=param,
                    sections=[
                        "calculation",
                        "calculations",
                        calculation_operation.upper(),
                    ],
                    value={},
                )

    # Operations for calculation
    if not operations:
        operations = param.get("calculation", {}).get("calculations", {})

    if operations:
        log.info(f"Calculations...")

    # For each operation, dispatch by its configured type
    for operation_name in operations:
        operation_name = operation_name.upper()
        if operation_name not in [""]:
            if operation_name in operations_config:
                log.info(f"Calculation '{operation_name}'")
                operation = operations_config[operation_name]
                operation_type = operation.get("type", "sql")
                if operation_type == "python":
                    self.calculation_process_function(
                        operation=operation, operation_name=operation_name
                    )
                elif operation_type == "sql":
                    self.calculation_process_sql(
                        operation=operation, operation_name=operation_name
                    )
                else:
                    log.error(
                        f"Operations config: Type '{operation_type}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Type '{operation_type}' NOT available"
                    )
            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' NOT available"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' NOT available"
                )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )
It takes a list of operations and, for each operation, checks whether it is a python or an sql operation, then calls the appropriate function.
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
7544 def calculation_process_sql( 7545 self, operation: dict, operation_name: str = "unknown" 7546 ) -> None: 7547 """ 7548 The `calculation_process_sql` function takes in a mathematical operation as a string and 7549 performs the operation, updating the specified table with the result. 7550 7551 :param operation: The `operation` parameter is a dictionary that contains information about the 7552 mathematical operation to be performed. It includes the following keys: 7553 :type operation: dict 7554 :param operation_name: The `operation_name` parameter is a string that represents the name of 7555 the mathematical operation being performed. It is used for logging and error handling purposes, 7556 defaults to unknown 7557 :type operation_name: str (optional) 7558 """ 7559 7560 # table variants 7561 table_variants = self.get_table_variants(clause="alter") 7562 7563 # Operation infos 7564 operation_name = operation.get("name", "unknown") 7565 log.debug(f"process sql {operation_name}") 7566 output_column_name = operation.get("output_column_name", operation_name) 7567 output_column_type = operation.get("output_column_type", "String") 7568 prefix = operation.get("explode_infos_prefix", "") 7569 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7570 output_column_description = operation.get( 7571 "output_column_description", f"{operation_name} operation" 7572 ) 7573 operation_query = operation.get("operation_query", None) 7574 if isinstance(operation_query, list): 7575 operation_query = " ".join(operation_query) 7576 operation_info_fields = operation.get("info_fields", []) 7577 operation_info_fields_check = operation.get("info_fields_check", False) 7578 operation_info = operation.get("operation_info", True) 7579 7580 if operation_query: 7581 7582 # Info fields check 7583 operation_info_fields_check_result = True 7584 if operation_info_fields_check: 7585 header_infos = self.get_header().infos 7586 for info_field in operation_info_fields: 7587 
operation_info_fields_check_result = ( 7588 operation_info_fields_check_result 7589 and info_field in header_infos 7590 ) 7591 7592 # If info fields available 7593 if operation_info_fields_check_result: 7594 7595 # Added_columns 7596 added_columns = [] 7597 7598 # Create VCF header field 7599 vcf_reader = self.get_header() 7600 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7601 output_column_name, 7602 ".", 7603 output_column_type, 7604 output_column_description, 7605 "howard calculation", 7606 "0", 7607 self.code_type_map.get(output_column_type), 7608 ) 7609 7610 # Explode infos if needed 7611 log.debug(f"calculation_process_sql prefix {prefix}") 7612 added_columns += self.explode_infos( 7613 prefix=prefix, 7614 fields=[output_column_name] + operation_info_fields, 7615 force=True, 7616 ) 7617 7618 # Create column 7619 added_column = self.add_column( 7620 table_name=table_variants, 7621 column_name=prefix + output_column_name, 7622 column_type=output_column_type_sql, 7623 default_value="null", 7624 ) 7625 added_columns.append(added_column) 7626 7627 # Operation calculation 7628 try: 7629 7630 # Query to update calculation column 7631 sql_update = f""" 7632 UPDATE {table_variants} 7633 SET "{prefix}{output_column_name}" = ({operation_query}) 7634 """ 7635 self.conn.execute(sql_update) 7636 7637 # Add to INFO 7638 if operation_info: 7639 sql_update_info = f""" 7640 UPDATE {table_variants} 7641 SET "INFO" = 7642 concat( 7643 CASE 7644 WHEN "INFO" IS NOT NULL 7645 THEN concat("INFO", ';') 7646 ELSE '' 7647 END, 7648 '{output_column_name}=', 7649 "{prefix}{output_column_name}" 7650 ) 7651 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7652 """ 7653 self.conn.execute(sql_update_info) 7654 7655 except: 7656 log.error( 7657 f"Operations config: Calculation '{operation_name}' query failed" 7658 ) 7659 raise ValueError( 7660 f"Operations config: Calculation '{operation_name}' query failed" 7661 ) 7662 7663 # Remove 
added columns 7664 for added_column in added_columns: 7665 log.debug(f"added_column: {added_column}") 7666 self.drop_column(column=added_column) 7667 7668 else: 7669 log.error( 7670 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7671 ) 7672 raise ValueError( 7673 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7674 ) 7675 7676 else: 7677 log.error( 7678 f"Operations config: Calculation '{operation_name}' query NOT defined" 7679 ) 7680 raise ValueError( 7681 f"Operations config: Calculation '{operation_name}' query NOT defined" 7682 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.

Parameters
- operation: a dictionary that contains information about the mathematical operation to be
  performed, including the query and its output column settings.
- operation_name: a string that represents the name of the mathematical operation being
  performed. It is used for logging and error handling purposes; defaults to "unknown".
def calculation_process_function(
    self, operation: dict, operation_name: str = "unknown"
) -> None:
    """
    Run a python-type calculation operation.

    The operation dictionary provides the method to call ('function_name',
    resolved on this object) and its positional arguments ('function_params').

    :param operation: operation description; keys: 'name' (optional),
        'function_name' (mandatory), 'function_params' (mandatory)
    :type operation: dict
    :param operation_name: fallback name used for logging when the operation
        dictionary has no 'name' entry, defaults to unknown
    :type operation_name: str (optional)
    :raises KeyError: if 'function_name' or 'function_params' is missing
    """

    # Fixed: honor the operation_name parameter as fallback (was a hard
    # KeyError on a missing 'name'), consistent with calculation_process_sql
    operation_name = operation.get("name", operation_name)
    # Fixed log message: this is the python-function processor, not sql
    log.debug(f"process function {operation_name}")

    # Resolve the configured method on self and call it with its parameters
    function_name = operation["function_name"]
    function_params = operation["function_params"]
    getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.

Parameters
- operation: a dictionary that contains information about the operation to be performed,
  including the function name and its parameters.
- operation_name: a string that represents the name of the operation being performed.
  It is used for logging purposes; defaults to "unknown".
def calculation_variant_id(self) -> None:
    """
    Annotate every variant with its variant ID.

    Declares the variant ID field in the VCF header, appends
    '<tag>=<value>' to the INFO column of the variants table, then drops
    the temporary variant ID column.
    """

    # Column holding the variant ID
    id_tag = self.get_variant_id_column()

    # Columns to clean up once INFO has been updated
    temporary_columns = [id_tag]

    # Description used for the VCF header entry
    infos_descriptions = {
        id_tag: "howard variant ID annotation",
    }

    # Target table
    variants_table = self.get_table_variants()

    # Declare the annotation in the VCF header
    header = self.get_header()
    header.infos[id_tag] = vcf.parser._Info(
        id_tag,
        ".",
        "String",
        infos_descriptions.get(id_tag, "howard variant ID annotation"),
        "howard calculation",
        "0",
        self.code_type_map.get("String"),
    )

    # Append '<id_tag>=<value>' to INFO, with a ';' separator when INFO
    # already holds a value
    self.conn.execute(
        f"""
        UPDATE {variants_table}
        SET "INFO" =
            concat(
                CASE
                    WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                    THEN ''
                    ELSE concat("INFO", ';')
                END,
                '{id_tag}=',
                "{id_tag}"
            )
        """
    )

    # Drop the temporary column(s)
    for temporary_column in temporary_columns:
        self.drop_column(column=temporary_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
def calculation_extract_snpeff_hgvs(self) -> None:
    """
    The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
    annotation field in a VCF file and adds them as a new column in the variants table.

    Reads the exploded snpEff 'ANN' INFO field, derives the HGVS strings via
    extract_snpeff_hgvs(), and appends them to INFO as 'snpeff_hgvs=...'.
    Requires a prior snpEff annotation ('ANN' present in the header); logs a
    warning and does nothing otherwise.
    """

    # snpEff source INFO field
    snpeff_ann = "ANN"

    # Name of the produced annotation
    snpeff_hgvs = "snpeff_hgvs"

    # Snpeff hgvs tags (VCF header descriptions)
    vcf_infos_tags = {
        snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
    }

    # Prefix for exploded INFO columns
    # NOTE(review): when a prefix IS configured it is replaced by "INFO/",
    # and when none is configured it stays "" — this looks inverted
    # (possibly intended: 'if not prefix: prefix = "INFO/"'). Confirm.
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # Exploded column names for the source and produced fields
    speff_ann_infos = prefix + snpeff_ann
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns created here, dropped at the end
    added_columns = []

    # Explode the 'ANN' INFO field into a table column
    added_columns += self.explode_infos(fields=[snpeff_ann])

    if "ANN" in vcf_reader.infos:

        log.debug(vcf_reader.infos["ANN"])

        # Create variant id column (used as the join key below)
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Pull variant id + exploded ANN into a dataframe
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Derive the HGVS string for each row from its ANN value
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(lambda x: extract_snpeff_hgvs(str(x)))

        # Add snpeff_hgvs to header
        vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
            snpeff_hgvs,
            ".",
            "String",
            vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append 'snpeff_hgvs=<value>' to INFO for non-empty HGVS values.
        # NOTE: 'dataframe_snpeff_hgvs' in the SQL resolves to the local
        # pandas DataFrame above (duckdb replacement scan) — do not rename
        # that local variable independently of the query text.
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{snpeff_hgvs}=',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

        """
        self.conn.execute(sql_update)

        # Free the dataframe memory eagerly
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        # NOTE(review): message typos ('Anotate', 'before use') — runtime
        # string deliberately left untouched here
        log.warning(
            "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
def calculation_extract_nomen(self) -> None:
    """
    This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

    From the exploded HGVS INFO field (param
    'calculation.calculations.NOMEN.options.hgvs_field', default 'hgvs'),
    selects a reference nomenclature per variant via find_nomen() —
    optionally constrained by a transcripts-of-preference file — and appends
    the NOMEN/CNOMEN/... fields to INFO.

    :raises ValueError: if the configured transcripts file does not exist
    """

    # Name of the temporary dataframe column holding the find_nomen() dict
    field_nomen_dict = "NOMEN_DICT"

    # NOMEN structure: produced INFO fields and their header descriptions
    nomen_dict = {
        "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
        "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
        "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
        "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
        "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
        "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
        "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
        "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
        "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
        "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
    }

    # Param
    param = self.get_param()

    # Prefix for exploded INFO columns
    prefix = self.get_explode_infos_prefix()

    # Header
    vcf_reader = self.get_header()

    # Source HGVS field (param option, default 'hgvs')
    hgvs_field = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("hgvs_field", "hgvs")
    )

    # Optional transcripts-of-preference file (first column = transcript ids)
    transcripts_file = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts", None)
    )
    transcripts_file = full_path(transcripts_file)
    transcripts = []
    if transcripts_file:
        if os.path.exists(transcripts_file):
            transcripts_dataframe = transcripts_file_to_df(transcripts_file)
            transcripts = transcripts_dataframe.iloc[:, 0].tolist()
        else:
            log.error(f"Transcript file '{transcripts_file}' does NOT exist")
            raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

    # Columns created here, dropped at the end
    added_columns = []

    # Explode HGVS field in column
    added_columns += self.explode_infos(fields=[hgvs_field])

    # Only proceed if the exploded column actually exists
    extra_infos = self.get_extra_infos()
    extra_field = prefix + hgvs_field

    if extra_field in extra_infos:

        # Pull the variant key + HGVS column into a dataframe
        dataframe_hgvs = self.get_query_to_df(
            f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
        )

        # Compute the NOMEN dict for each row
        dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
            lambda x: find_nomen(str(x), transcripts=transcripts)
        )

        # Explode NOMEN Structure and create SQL set for update
        sql_nomen_fields = []
        for nomen_field in nomen_dict:

            # Explode each field into a column
            dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                lambda x: dict(x).get(nomen_field, "")
            )

            # Create VCF header field
            vcf_reader.infos[nomen_field] = vcf.parser._Info(
                nomen_field,
                ".",
                "String",
                nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )
            # SQL fragment appending ';<field>=<value>' when non-empty
            sql_nomen_fields.append(
                f"""
                CASE
                    WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                    THEN concat(
                        ';{nomen_field}=',
                        dataframe_hgvs."{nomen_field}"
                    )
                    ELSE ''
                END
                """
            )

        # SQL set for update
        sql_nomen_fields_set = ", ".join(sql_nomen_fields)

        # Append all NOMEN fields to INFO.
        # NOTE: 'dataframe_hgvs' in the SQL resolves to the local pandas
        # DataFrame above (duckdb replacement scan) — do not rename that
        # local variable independently of the query text.
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL
                        THEN ''
                        ELSE "INFO"
                    END,
                    {sql_nomen_fields_set}
                )
            FROM dataframe_hgvs
            WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
        """
        self.conn.execute(sql_update)

        # Free the dataframe memory eagerly
        del dataframe_hgvs
        gc.collect()

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
    """
    The function `calculation_find_by_pipeline` performs a calculation to find the number of
    pipeline/sample for a variant and updates the variant information in a VCF file.

    Does nothing unless the table has a FORMAT column and at least one sample.

    :param tag: The `tag` parameter is a string that represents the annotation field for the
    "findbypipeline" information in the VCF file. It is used to create the annotation field in the
    VCF header and to update the corresponding field in the variants table, defaults to
    findbypipeline
    :type tag: str (optional)
    """

    # Only applicable with genotypes: FORMAT column and samples present
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # findbypipeline annotation field
        findbypipeline_tag = tag

        # VCF header description for the tag
        vcf_infos_tags = {
            findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
        }

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Dataframe column receiving the computed value
        findbypipeline_infos = prefix + findbypipeline_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (used as the join key below)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # Columns to select: variant_id, FORMAT and all samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Pull genotype columns into a dataframe
        dataframe_findbypipeline = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute findbypipeline per row from the sample columns
        dataframe_findbypipeline[findbypipeline_infos] = (
            dataframe_findbypipeline.apply(
                lambda row: findbypipeline(
                    row, samples=self.get_header_sample_list()
                ),
                axis=1,
            )
        )

        # Add findbypipeline tag to the VCF header
        vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
            findbypipeline_tag,
            ".",
            "String",
            vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO for non-empty results.
        # NOTE: 'dataframe_findbypipeline' in the SQL resolves to the local
        # pandas DataFrame above (duckdb replacement scan) — do not rename
        # that local variable independently of the query text.
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                        AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                        THEN concat(
                            '{findbypipeline_tag}=',
                            dataframe_findbypipeline."{findbypipeline_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_findbypipeline
            WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Free the dataframe memory eagerly
        del dataframe_findbypipeline
        gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
- tag: a string that represents the annotation field for the "findbypipeline" information
  in the VCF file. It is used to create the annotation field in the VCF header and to update
  the corresponding field in the variants table; defaults to "findbypipeline".
def calculation_genotype_concordance(self) -> None:
    """
    The function `calculation_genotype_concordance` calculates the genotype concordance for
    multi-caller VCF files and updates the variant information in the database.

    Does nothing unless the table has a FORMAT column and at least one sample.
    """

    # Only applicable with genotypes: FORMAT column and samples present
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # genotypeconcordance annotation field
        genotypeconcordance_tag = "genotypeconcordance"

        # VCF header description for the tag
        vcf_infos_tags = {
            genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
        }

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Dataframe column receiving the computed value
        genotypeconcordance_infos = prefix + genotypeconcordance_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (used as the join key below)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # Columns to select: variant_id, FORMAT and all samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Pull genotype columns into a dataframe
        dataframe_genotypeconcordance = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute genotype concordance per row from the sample columns
        dataframe_genotypeconcordance[genotypeconcordance_infos] = (
            dataframe_genotypeconcordance.apply(
                lambda row: genotypeconcordance(
                    row, samples=self.get_header_sample_list()
                ),
                axis=1,
            )
        )

        # Add genotypeconcordance to header
        # NOTE(review): the fallback description "snpEff hgvs annotations"
        # looks copy-pasted from calculation_extract_snpeff_hgvs (it is
        # unused here since the tag is in vcf_infos_tags)
        vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
            genotypeconcordance_tag,
            ".",
            "String",
            vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO for non-empty results.
        # NOTE: 'dataframe_genotypeconcordance' in the SQL resolves to the
        # local pandas DataFrame above (duckdb replacement scan) — do not
        # rename that local variable independently of the query text.
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                        AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                        THEN concat(
                            '{genotypeconcordance_tag}=',
                            dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_genotypeconcordance
            WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Free the dataframe memory eagerly
        del dataframe_genotypeconcordance
        gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
def calculation_barcode(self, tag: str = "barcode") -> None:
    """
    The `calculation_barcode` function calculates barcode values for variants in a VCF file and
    updates the INFO field in the file with the calculated barcode values.

    Does nothing unless the table has a FORMAT column and at least one sample.

    :param tag: name of the INFO annotation to create, defaults to barcode
    :type tag: str (optional)
    """

    # Only applicable with genotypes: FORMAT column and samples present
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # Guard against a falsy tag argument (e.g. empty string)
        if not tag:
            tag = "barcode"

        # VCF header description for the tag
        vcf_infos_tags = {
            tag: "barcode calculation (VaRank)",
        }

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Dataframe column receiving the computed value
        barcode_infos = prefix + tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (used as the join key below)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # Columns to select: variant_id, FORMAT and all samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Pull genotype columns into a dataframe
        dataframe_barcode = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute the barcode per row from the sample columns
        dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
            lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
        )

        # Add barcode to header
        # NOTE(review): the fallback duplicates the lookup — this is
        # equivalent to vcf_infos_tags.get(tag)
        vcf_reader.infos[tag] = vcf.parser._Info(
            tag,
            ".",
            "String",
            vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO for non-empty results.
        # NOTE: 'dataframe_barcode' in the SQL resolves to the local pandas
        # DataFrame above (duckdb replacement scan) — do not rename that
        # local variable independently of the query text.
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                        AND dataframe_barcode."{barcode_infos}" NOT NULL
                        THEN concat(
                            '{tag}=',
                            dataframe_barcode."{barcode_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_barcode
            WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Free the dataframe memory eagerly
        del dataframe_barcode
        gc.collect()
The `calculation_barcode` function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for each variant from the genotypes of the
        pedigree samples, and append two new FORMAT fields to every sample:
        '<tag>' (the barcode value for pedigree samples, '.' otherwise) and
        '<tag>S' (the comma-joined list of pedigree samples, '.' otherwise).

        The pedigree comes from param
        calculation.calculations.BARCODEFAMILY.family_pedigree and may be a
        JSON file path, a JSON string, a comma-separated list of sample names,
        or a dict; when absent, all header samples are used.

        Does nothing unless the header has a FORMAT column and at least one
        sample.

        :param tag: The `tag` parameter in the `calculation_barcode_family`
            function is used to specify the barcode tag that will be added to
            the VCF file during the calculation process. If no value is
            provided for the `tag` parameter, the default value used is "BCF",
            defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable if genotypes are present (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag if an empty value was provided
            if not tag:
                tag = "BCF"

            # Descriptions for the two FORMAT fields added to the header
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree definition from the calculation parameters)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED — `ped` is progressively rebound until it is a dict
            # mapping family member to sample name
            if ped:

                # Pedigree is a file (JSON content expected)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else a comma-separated
                # list of sample names (each sample mapped to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Sample names are the dict values
                ped_samples = list(ped.values())

            else:
                # No pedigree defined: use every sample, mapped to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the table; dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotype columns into a dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode per variant from the pedigree genotypes
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two new FORMAT fields in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT):
            # pedigree samples get the barcode + sample list, FORMAT gets the
            # field names, other samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Used to turn the FORMAT description into ':.' placeholders
                # for samples whose genotype is './.' (missing)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses, joining on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and release memory
            del dataframe_barcode
            gc.collect()
The `calculation_barcode_family` function calculates family barcode values for variants in a VCF file
and updates the sample FORMAT fields in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value "BCF" is used.
8499 def calculation_trio(self) -> None: 8500 """ 8501 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8502 information to the INFO field of each variant. 8503 """ 8504 8505 # if FORMAT and samples 8506 if ( 8507 "FORMAT" in self.get_header_columns_as_list() 8508 and self.get_header_sample_list() 8509 ): 8510 8511 # trio annotation field 8512 trio_tag = "trio" 8513 8514 # VCF infos tags 8515 vcf_infos_tags = { 8516 "trio": "trio calculation", 8517 } 8518 8519 # Param 8520 param = self.get_param() 8521 8522 # Prefix 8523 prefix = self.get_explode_infos_prefix() 8524 8525 # Trio param 8526 trio_ped = ( 8527 param.get("calculation", {}) 8528 .get("calculations", {}) 8529 .get("TRIO", {}) 8530 .get("trio_pedigree", None) 8531 ) 8532 8533 # Load trio 8534 if trio_ped: 8535 8536 # Trio pedigree is a file 8537 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8538 log.debug("TRIO pedigree is file") 8539 with open(full_path(trio_ped)) as trio_ped: 8540 trio_ped = json.load(trio_ped) 8541 8542 # Trio pedigree is a string 8543 elif isinstance(trio_ped, str): 8544 log.debug("TRIO pedigree is str") 8545 try: 8546 trio_ped = json.loads(trio_ped) 8547 log.debug("TRIO pedigree is json str") 8548 except ValueError as e: 8549 trio_samples = trio_ped.split(",") 8550 if len(trio_samples) == 3: 8551 trio_ped = { 8552 "father": trio_samples[0], 8553 "mother": trio_samples[1], 8554 "child": trio_samples[2], 8555 } 8556 log.debug("TRIO pedigree is list str") 8557 else: 8558 msg_error = "TRIO pedigree not well formatted" 8559 log.error(msg_error) 8560 raise ValueError(msg_error) 8561 8562 # Trio pedigree is a dict 8563 elif isinstance(trio_ped, dict): 8564 log.debug("TRIO pedigree is dict") 8565 8566 # Trio pedigree is not well formatted 8567 else: 8568 msg_error = "TRIO pedigree not well formatted" 8569 log.error(msg_error) 8570 raise ValueError(msg_error) 8571 8572 # Construct trio list 8573 trio_samples = [ 8574 
trio_ped.get("father", ""), 8575 trio_ped.get("mother", ""), 8576 trio_ped.get("child", ""), 8577 ] 8578 8579 else: 8580 log.debug("TRIO pedigree not defined. Take the first 3 samples") 8581 samples_list = self.get_header_sample_list() 8582 if len(samples_list) >= 3: 8583 trio_samples = self.get_header_sample_list()[0:3] 8584 trio_ped = { 8585 "father": trio_samples[0], 8586 "mother": trio_samples[1], 8587 "child": trio_samples[2], 8588 } 8589 else: 8590 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8591 log.error(msg_error) 8592 raise ValueError(msg_error) 8593 8594 # Check trio pedigree 8595 if not trio_ped or len(trio_ped) != 3: 8596 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8597 log.error(msg_error) 8598 raise ValueError(msg_error) 8599 8600 # Log 8601 log.info( 8602 f"Calculation 'TRIO' - Samples: " 8603 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 8604 ) 8605 8606 # Field 8607 trio_infos = prefix + trio_tag 8608 8609 # Variants table 8610 table_variants = self.get_table_variants() 8611 8612 # Header 8613 vcf_reader = self.get_header() 8614 8615 # Create variant id 8616 variant_id_column = self.get_variant_id_column() 8617 added_columns = [variant_id_column] 8618 8619 # variant_id, FORMAT and samples 8620 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8621 self.get_header_sample_list() 8622 ) 8623 8624 # Create dataframe 8625 dataframe_trio = self.get_query_to_df( 8626 f""" SELECT {samples_fields} FROM {table_variants} """ 8627 ) 8628 8629 # Create trio column 8630 dataframe_trio[trio_infos] = dataframe_trio.apply( 8631 lambda row: trio(row, samples=trio_samples), axis=1 8632 ) 8633 8634 # Add trio to header 8635 vcf_reader.infos[trio_tag] = vcf.parser._Info( 8636 trio_tag, 8637 ".", 8638 "String", 8639 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 8640 "howard calculation", 8641 "0", 8642 self.code_type_map.get("String"), 8643 ) 8644 8645 # Update 8646 
sql_update = f""" 8647 UPDATE {table_variants} 8648 SET "INFO" = 8649 concat( 8650 CASE 8651 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8652 THEN '' 8653 ELSE concat("INFO", ';') 8654 END, 8655 CASE 8656 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 8657 AND dataframe_trio."{trio_infos}" NOT NULL 8658 THEN concat( 8659 '{trio_tag}=', 8660 dataframe_trio."{trio_infos}" 8661 ) 8662 ELSE '' 8663 END 8664 ) 8665 FROM dataframe_trio 8666 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 8667 """ 8668 self.conn.execute(sql_update) 8669 8670 # Remove added columns 8671 for added_column in added_columns: 8672 self.drop_column(column=added_column) 8673 8674 # Delete dataframe 8675 del dataframe_trio 8676 gc.collect()
The `calculation_trio` function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
8678 def calculation_vaf_normalization(self) -> None: 8679 """ 8680 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8681 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8682 :return: The function does not return anything. 8683 """ 8684 8685 # if FORMAT and samples 8686 if ( 8687 "FORMAT" in self.get_header_columns_as_list() 8688 and self.get_header_sample_list() 8689 ): 8690 8691 # vaf_normalization annotation field 8692 vaf_normalization_tag = "VAF" 8693 8694 # VCF infos tags 8695 vcf_infos_tags = { 8696 "VAF": "VAF Variant Frequency", 8697 } 8698 8699 # Prefix 8700 prefix = self.get_explode_infos_prefix() 8701 8702 # Variants table 8703 table_variants = self.get_table_variants() 8704 8705 # Header 8706 vcf_reader = self.get_header() 8707 8708 # Do not calculate if VAF already exists 8709 if "VAF" in vcf_reader.formats: 8710 log.debug("VAF already on genotypes") 8711 return 8712 8713 # Create variant id 8714 variant_id_column = self.get_variant_id_column() 8715 added_columns = [variant_id_column] 8716 8717 # variant_id, FORMAT and samples 8718 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8719 self.get_header_sample_list() 8720 ) 8721 8722 # Create dataframe 8723 dataframe_vaf_normalization = self.get_query_to_df( 8724 f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 8725 ) 8726 8727 vaf_normalization_set = [] 8728 8729 # for each sample vaf_normalization 8730 for sample in self.get_header_sample_list(): 8731 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 8732 lambda row: vaf_normalization(row, sample=sample), axis=1 8733 ) 8734 vaf_normalization_set.append( 8735 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 8736 ) 8737 8738 # Add VAF to FORMAT 8739 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 8740 "FORMAT" 8741 ].apply(lambda x: str(x) + ":VAF") 8742 
vaf_normalization_set.append( 8743 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 8744 ) 8745 8746 # Add vaf_normalization to header 8747 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 8748 id=vaf_normalization_tag, 8749 num="1", 8750 type="Float", 8751 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 8752 type_code=self.code_type_map.get("Float"), 8753 ) 8754 8755 # Create fields to add in INFO 8756 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 8757 8758 # Update 8759 sql_update = f""" 8760 UPDATE {table_variants} 8761 SET {sql_vaf_normalization_set} 8762 FROM dataframe_vaf_normalization 8763 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 8764 8765 """ 8766 self.conn.execute(sql_update) 8767 8768 # Remove added columns 8769 for added_column in added_columns: 8770 self.drop_column(column=added_column) 8771 8772 # Delete dataframe 8773 del dataframe_vaf_normalization 8774 gc.collect()
The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and sample fields accordingly.
Returns
The function does not return anything.
8776 def calculation_genotype_stats(self, info: str = "VAF") -> None: 8777 """ 8778 The `calculation_genotype_stats` function calculates genotype statistics for a given information 8779 field in a VCF file and updates the INFO column of the variants table with the calculated 8780 statistics. 8781 8782 :param info: The `info` parameter is a string that represents the type of information for which 8783 genotype statistics are calculated. It is used to generate various VCF info tags for the 8784 statistics, such as the number of occurrences, the list of values, the minimum value, the 8785 maximum value, the mean, the median, defaults to VAF 8786 :type info: str (optional) 8787 """ 8788 8789 # if FORMAT and samples 8790 if ( 8791 "FORMAT" in self.get_header_columns_as_list() 8792 and self.get_header_sample_list() 8793 ): 8794 8795 # vaf_stats annotation field 8796 vaf_stats_tag = info + "_stats" 8797 8798 # VCF infos tags 8799 vcf_infos_tags = { 8800 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 8801 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 8802 info + "_stats_min": f"genotype {info} Statistics - min {info}", 8803 info + "_stats_max": f"genotype {info} Statistics - max {info}", 8804 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 8805 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 8806 info 8807 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 8808 } 8809 8810 # Prefix 8811 prefix = self.get_explode_infos_prefix() 8812 8813 # Field 8814 vaf_stats_infos = prefix + vaf_stats_tag 8815 8816 # Variants table 8817 table_variants = self.get_table_variants() 8818 8819 # Header 8820 vcf_reader = self.get_header() 8821 8822 # Create variant id 8823 variant_id_column = self.get_variant_id_column() 8824 added_columns = [variant_id_column] 8825 8826 # variant_id, FORMAT and samples 8827 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8828 
self.get_header_sample_list() 8829 ) 8830 8831 # Create dataframe 8832 dataframe_vaf_stats = self.get_query_to_df( 8833 f""" SELECT {samples_fields} FROM {table_variants} """ 8834 ) 8835 8836 # Create vaf_stats column 8837 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 8838 lambda row: genotype_stats( 8839 row, samples=self.get_header_sample_list(), info=info 8840 ), 8841 axis=1, 8842 ) 8843 8844 # List of vcf tags 8845 sql_vaf_stats_fields = [] 8846 8847 # Check all VAF stats infos 8848 for stat in vcf_infos_tags: 8849 8850 # Extract stats 8851 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 8852 lambda x: dict(x).get(stat, "") 8853 ) 8854 8855 # Add snpeff_hgvs to header 8856 vcf_reader.infos[stat] = vcf.parser._Info( 8857 stat, 8858 ".", 8859 "String", 8860 vcf_infos_tags.get(stat, "genotype statistics"), 8861 "howard calculation", 8862 "0", 8863 self.code_type_map.get("String"), 8864 ) 8865 8866 if len(sql_vaf_stats_fields): 8867 sep = ";" 8868 else: 8869 sep = "" 8870 8871 # Create fields to add in INFO 8872 sql_vaf_stats_fields.append( 8873 f""" 8874 CASE 8875 WHEN dataframe_vaf_stats."{stat}" NOT NULL 8876 THEN concat( 8877 '{sep}{stat}=', 8878 dataframe_vaf_stats."{stat}" 8879 ) 8880 ELSE '' 8881 END 8882 """ 8883 ) 8884 8885 # SQL set for update 8886 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 8887 8888 # Update 8889 sql_update = f""" 8890 UPDATE variants 8891 SET "INFO" = 8892 concat( 8893 CASE 8894 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8895 THEN '' 8896 ELSE concat("INFO", ';') 8897 END, 8898 {sql_vaf_stats_fields_set} 8899 ) 8900 FROM dataframe_vaf_stats 8901 WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 8902 8903 """ 8904 self.conn.execute(sql_update) 8905 8906 # Remove added columns 8907 for added_column in added_columns: 8908 self.drop_column(column=added_column) 8909 8910 # Delete dataframe 8911 del dataframe_vaf_stats 8912 gc.collect()
The `calculation_genotype_stats` function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median. Defaults to "VAF".